Advertisement
Chewgum

Untitled

Feb 15th, 2014
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Lua 8.29 KB | None | 0 0
  1. -- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
  2. --
  3. -- Provides UTF-8 aware string functions implemented in pure lua:
  4. -- * string.utf8len(s)
  5. -- * string.utf8sub(s, i, j)
  6. -- * string.utf8reverse(s)
  7. --
  8. -- If utf8data.lua (containing the lower<->upper case mappings) is loaded before
  9. -- this file, these additional functions are available:
  10. -- * string.utf8upper(s)
  11. -- * string.utf8lower(s)
  12. --
  13. -- All functions behave as their non UTF-8 aware counterparts with the exception
  14. -- that UTF-8 characters are used instead of bytes for all units.
  15.  
  16. --[[
  17. Copyright (c) 2006-2007, Kyle Smith
  18. All rights reserved.
  19.  
  20. Redistribution and use in source and binary forms, with or without
  21. modification, are permitted provided that the following conditions are met:
  22.  
  23.     * Redistributions of source code must retain the above copyright notice,
  24.       this list of conditions and the following disclaimer.
  25.     * Redistributions in binary form must reproduce the above copyright
  26.       notice, this list of conditions and the following disclaimer in the
  27.       documentation and/or other materials provided with the distribution.
  28.     * Neither the name of the author nor the names of its contributors may be
  29.       used to endorse or promote products derived from this software without
  30.       specific prior written permission.
  31.  
  32. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  33. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  34. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  35. DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
  36. FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  38. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  39. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  40. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  41. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42. --]]
  43.  
  44. -- ABNF from RFC 3629
  45. --
  46. -- UTF8-octets = *( UTF8-char )
  47. -- UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
  48. -- UTF8-1      = %x00-7F
  49. -- UTF8-2      = %xC2-DF UTF8-tail
  50. -- UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
  51. --               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
  52. -- UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
  53. --               %xF4 %x80-8F 2( UTF8-tail )
  54. -- UTF8-tail   = %x80-BF
  55. --
  56.  
  --- Returns the number of bytes used by the UTF-8 character that starts at
  -- byte i of s. Doubles as a validator: the whole sequence is checked against
  -- the RFC 3629 grammar quoted above.
  -- @param s string to inspect
  -- @param i byte offset of the first byte of the character (default 1)
  -- @return 1, 2, 3 or 4
  -- Raises an error when an argument has the wrong type, when i does not
  -- address a byte of s, when the sequence is cut off by the end of the
  -- string, or when a byte is not allowed at its position.
  local function utf8charbytes (s, i)
      -- argument defaults
      i = i or 1

      -- argument checking
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
      end
      if type(i) ~= "number" then
          error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
      end

      local c = s:byte(i)

      -- s:byte returns nothing when i is outside the string; report that
      -- explicitly instead of failing on a comparison with nil below
      if not c then
          error("bad argument #2 to 'utf8charbytes' (position out of range)")
      end

      -- determine bytes needed for character, based on RFC 3629
      -- validate byte 1
      if c <= 127 then
          -- UTF8-1 = %x00-7F, so the NUL byte is a valid one-byte character
          return 1

      elseif c >= 194 and c <= 223 then
          -- UTF8-2
          local c2 = s:byte(i + 1)

          if not c2 then
              error("UTF-8 string terminated early")
          end

          -- validate byte 2
          if c2 < 128 or c2 > 191 then
              error("Invalid UTF-8 character")
          end

          return 2

      elseif c >= 224 and c <= 239 then
          -- UTF8-3
          local c2 = s:byte(i + 1)
          local c3 = s:byte(i + 2)

          if not c2 or not c3 then
              error("UTF-8 string terminated early")
          end

          -- validate byte 2: lead bytes E0 and ED only allow a narrowed range
          -- (A0-BF and 80-9F respectively), every other lead allows 80-BF
          if c == 224 and (c2 < 160 or c2 > 191) then
              error("Invalid UTF-8 character")
          elseif c == 237 and (c2 < 128 or c2 > 159) then
              error("Invalid UTF-8 character")
          elseif c2 < 128 or c2 > 191 then
              error("Invalid UTF-8 character")
          end

          -- validate byte 3
          if c3 < 128 or c3 > 191 then
              error("Invalid UTF-8 character")
          end

          return 3

      elseif c >= 240 and c <= 244 then
          -- UTF8-4
          local c2 = s:byte(i + 1)
          local c3 = s:byte(i + 2)
          local c4 = s:byte(i + 3)

          if not c2 or not c3 or not c4 then
              error("UTF-8 string terminated early")
          end

          -- validate byte 2: lead bytes F0 and F4 only allow a narrowed range
          -- (90-BF and 80-8F respectively), every other lead allows 80-BF
          if c == 240 and (c2 < 144 or c2 > 191) then
              error("Invalid UTF-8 character")
          elseif c == 244 and (c2 < 128 or c2 > 143) then
              error("Invalid UTF-8 character")
          elseif c2 < 128 or c2 > 191 then
              error("Invalid UTF-8 character")
          end

          -- validate byte 3
          if c3 < 128 or c3 > 191 then
              error("Invalid UTF-8 character")
          end

          -- validate byte 4
          if c4 < 128 or c4 > 191 then
              error("Invalid UTF-8 character")
          end

          return 4

      else
          -- 80-C1 and F5-FF can never start a character
          error("Invalid UTF-8 character")
      end
  end
  154.  
  155.  
  --- Counts the UTF-8 characters in s.
  -- @param s string to measure
  -- @return number of characters (not bytes)
  -- Raises an error when s is not a string; errors raised by utf8charbytes
  -- while stepping through the string propagate to the caller.
  local function utf8len (s)
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
      end

      local total = s:len()
      local count = 0
      local offset = 1

      -- hop from one character start to the next until the bytes run out
      while offset <= total do
          count = count + 1
          offset = offset + utf8charbytes(s, offset)
      end

      return count
  end

  -- expose as string.utf8len, keeping any implementation that is already there
  string.utf8len = string.utf8len or utf8len
  179.  
  180.  
  --- Works like string.sub, except that i and j count UTF-8 characters
  -- instead of bytes.
  -- @param s string to slice
  -- @param i first character to include; negative values count from the end
  -- @param j last character to include (default -1, the last character);
  --          negative values count from the end
  -- @return the selected characters, or "" when the range selects nothing
  --         (start after end, start beyond the last character, or end before
  --         the first character)
  -- Raises an error on wrongly typed arguments. Only the characters that are
  -- stepped over (up to j) are validated by utf8charbytes; when a negative
  -- index is used the whole string is validated while it is being counted.
  local function utf8sub (s, i, j)
      -- argument defaults
      j = j or -1

      -- argument checking
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8sub' (string expected, got ".. type(s).. ")")
      end
      if type(i) ~= "number" then
          error("bad argument #2 to 'utf8sub' (number expected, got ".. type(i).. ")")
      end
      if type(j) ~= "number" then
          error("bad argument #3 to 'utf8sub' (number expected, got ".. type(j).. ")")
      end

      local bytes = s:len()

      -- the character count is only needed to resolve negative indices, so
      -- the extra pass over the string is skipped when both are non-negative
      local startChar, endChar = i, j
      if i < 0 or j < 0 then
          local l = utf8len(s)
          if i < 0 then
              startChar = l + i + 1
          end
          if j < 0 then
              endChar = l + j + 1
          end
      end

      -- like string.sub, a start before the first character means the first
      if startChar < 1 then
          startChar = 1
      end

      -- nothing is selected when the end lies before the first character or
      -- before the start
      if endChar < 1 or startChar > endChar then
          return ""
      end

      -- byte offsets to pass to string.sub; startByte begins past the end so
      -- that a start beyond the last character yields "" rather than all of s,
      -- while endByte defaults to the last byte for an end beyond the string
      local startByte, endByte = bytes + 1, bytes

      local pos = 1
      local len = 0

      while pos <= bytes do
          len = len + 1

          if len == startChar then
              startByte = pos
          end

          pos = pos + utf8charbytes(s, pos)

          if len == endChar then
              endByte = pos - 1
              break
          end
      end

      return s:sub(startByte, endByte)
  end

  -- install in the string library
  if not string.utf8sub then
      string.utf8sub = utf8sub
  end
  237.  
  238.  
  --- Replaces UTF-8 characters according to a mapping table.
  -- @param s string to transform
  -- @param mapping table indexed by single UTF-8 characters (as strings); a
  --        character whose entry is nil or false is copied through unchanged
  -- @return the transformed string
  -- Raises an error on wrongly typed arguments; errors raised by
  -- utf8charbytes while stepping through s propagate to the caller.
  local function utf8replace (s, mapping)
      -- argument checking
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
      end
      if type(mapping) ~= "table" then
          error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
      end

      local pos = 1
      local bytes = s:len()

      -- pieces are collected and joined once at the end; appending to a
      -- string inside the loop would copy the partial result every iteration
      local parts = {}
      local n = 0

      while pos <= bytes do
          local charbytes = utf8charbytes(s, pos)
          local c = s:sub(pos, pos + charbytes - 1)

          n = n + 1
          parts[n] = mapping[c] or c

          pos = pos + charbytes
      end

      return table.concat(parts)
  end
  265.  
  266.  
  --- Counterpart of string.upper for UTF-8 strings: every character of s is
  -- passed through utf8replace with the global table utf8_lc_uc (presumably
  -- the lower-to-upper mapping from utf8data.lua).
  -- @param s string to convert
  -- @return whatever utf8replace returns for s and that table
  local function utf8upper (s)
      local caseMap = utf8_lc_uc
      return utf8replace(s, caseMap)
  end

  -- exposed as string.utf8upper only when the mapping table exists at load
  -- time and no implementation is installed yet
  if utf8_lc_uc and not string.utf8upper then
      string.utf8upper = utf8upper
  end
  276.  
  277.  
  --- Counterpart of string.lower for UTF-8 strings: every character of s is
  -- passed through utf8replace with the global table utf8_uc_lc (presumably
  -- the upper-to-lower mapping from utf8data.lua).
  -- @param s string to convert
  -- @return whatever utf8replace returns for s and that table
  local function utf8lower (s)
      local caseMap = utf8_uc_lc
      return utf8replace(s, caseMap)
  end

  -- exposed as string.utf8lower only when the mapping table exists at load
  -- time and no implementation is installed yet
  if utf8_uc_lc and not string.utf8lower then
      string.utf8lower = utf8lower
  end
  287.  
  288.  
  --- Works like string.reverse, except that the order of UTF-8 characters is
  -- reversed while the bytes inside each character keep their order.
  -- @param s string to reverse
  -- @return the reversed string
  -- Raises an error when s is not a string or is not valid UTF-8. This
  -- includes continuation bytes (80-BF) that do not belong to the character
  -- in front of them.
  local function utf8reverse (s)
      -- argument checking
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
      end

      local pos = s:len()

      -- characters are collected back to front and joined once at the end
      local parts = {}
      local n = 0

      while pos > 0 do
          -- pos starts on the last byte of a character
          local last = pos
          local c = s:byte(pos)

          -- walk back over continuation bytes to reach the lead byte
          while c >= 128 and c <= 191 do
              pos = pos - 1
              if pos < 1 then
                  -- continuation bytes with no lead byte in front of them
                  error("Invalid UTF-8 character")
              end
              c = s:byte(pos)
          end

          local charbytes = utf8charbytes(s, pos)

          -- the character found at pos must end exactly where the backwards
          -- walk started, otherwise stray continuation bytes follow it
          if pos + charbytes - 1 ~= last then
              error("Invalid UTF-8 character")
          end

          n = n + 1
          parts[n] = s:sub(pos, last)

          pos = pos - 1
      end

      return table.concat(parts)
  end

  -- install in the string library
  if not string.utf8reverse then
      string.utf8reverse = utf8reverse
  end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement