Advertisement
Guest User

Untitled

a guest
Jan 13th, 2018
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Lua 5.56 KB | None | 0 0
  1. if SERVER then
  2.     AddCSLuaFile("utf8data.lua")
  3. end
  4. include("utf8data.lua")
  5.  
  6. -- Copyright (c) 2006-2007, Kyle Smith
  7.  
  -- Returns the number of bytes (1 to 4) used by the UTF-8 character whose
  -- lead byte sits at byte offset i of s. Doubles as a validator: every byte
  -- of the sequence is checked against the ranges allowed by RFC 3629.
  --
  -- s: the string to inspect
  -- i: byte offset of the character's first byte (defaults to 1)
  --
  -- Raises an error when an argument has the wrong type, when i does not
  -- address a byte of s, when the string ends in the middle of a sequence,
  -- or when any byte lies outside the range permitted for its position.
  local function utf8charbytes(s, i)
      -- argument defaults
      i = i or 1

      -- argument checking
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
      end
      if type(i) ~= "number" then
          error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
      end

      local c = s:byte(i)

      -- s:byte yields no value for an offset outside the string (which is
      -- every offset of the empty string); report that explicitly instead of
      -- failing on a nil comparison below
      if not c then
          error("bad argument #2 to 'utf8charbytes' (position out of range)")
      end

      -- determine bytes needed for character, based on RFC 3629
      -- validate byte 1
      if c <= 127 then
          -- UTF8-1 (%x00-7F; the NUL byte is a valid one-byte character)
          return 1
      elseif c >= 194 and c <= 223 then
          -- UTF8-2
          local c2 = s:byte(i + 1)

          if not c2 then
              error("UTF-8 string terminated early")
          end

          -- validate byte 2
          if c2 < 128 or c2 > 191 then
              error("Invalid UTF-8 character")
          end

          return 2
      elseif c >= 224 and c <= 239 then
          -- UTF8-3
          local c2 = s:byte(i + 1)
          local c3 = s:byte(i + 2)

          if not c2 or not c3 then
              error("UTF-8 string terminated early")
          end

          -- validate byte 2; lead bytes 224 and 237 narrow its range to
          -- exclude overlong forms and UTF-16 surrogates respectively
          if c == 224 and (c2 < 160 or c2 > 191) then
              error("Invalid UTF-8 character")
          elseif c == 237 and (c2 < 128 or c2 > 159) then
              error("Invalid UTF-8 character")
          elseif c2 < 128 or c2 > 191 then
              error("Invalid UTF-8 character")
          end

          -- validate byte 3
          if c3 < 128 or c3 > 191 then
              error("Invalid UTF-8 character")
          end

          return 3
      elseif c >= 240 and c <= 244 then
          -- UTF8-4
          local c2 = s:byte(i + 1)
          local c3 = s:byte(i + 2)
          local c4 = s:byte(i + 3)

          if not c2 or not c3 or not c4 then
              error("UTF-8 string terminated early")
          end

          -- validate byte 2; lead bytes 240 and 244 narrow its range to
          -- exclude overlong forms and code points above U+10FFFF
          if c == 240 and (c2 < 144 or c2 > 191) then
              error("Invalid UTF-8 character")
          elseif c == 244 and (c2 < 128 or c2 > 143) then
              error("Invalid UTF-8 character")
          elseif c2 < 128 or c2 > 191 then
              error("Invalid UTF-8 character")
          end

          -- validate byte 3
          if c3 < 128 or c3 > 191 then
              error("Invalid UTF-8 character")
          end

          -- validate byte 4
          if c4 < 128 or c4 > 191 then
              error("Invalid UTF-8 character")
          end

          return 4
      else
          -- continuation bytes (128-191) and the bytes 192, 193, 245-255 can
          -- never start a character
          error("Invalid UTF-8 character")
      end
  end
  101.  
  -- Counts the characters (as opposed to bytes) in a UTF-8 encoded string.
  --
  -- s: the string to measure
  --
  -- Raises an error if s is not a string. The string is walked one character
  -- at a time with utf8charbytes, so any error that function raises for a
  -- malformed sequence propagates to the caller.
  local function utf8len(s)
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
      end

      local total = #s
      local offset = 1
      local count = 0

      -- hop from lead byte to lead byte, tallying one character per hop
      while offset <= total do
          offset = offset + utf8charbytes(s, offset)
          count = count + 1
      end

      return count
  end

  -- expose as a string method: ("..."):utf8len()
  string.utf8len = utf8len
  121.  
  -- Works like string.sub, except that i and j count UTF-8 characters rather
  -- than bytes.
  --
  -- s: the string to slice
  -- i: index of the first character to include; negative values count back
  --    from the end of the string
  -- j: index of the last character to include (defaults to -1, the last
  --    character); negative values count back from the end of the string
  --
  -- Returns the selected substring, or "" when the requested range is empty
  -- or lies entirely outside the string (matching string.sub).
  -- Raises an error on wrongly typed arguments; errors raised by
  -- utf8charbytes / utf8len for malformed UTF-8 propagate to the caller.
  local function utf8sub(s, i, j)
      -- argument defaults
      j = j or -1

      -- argument checking
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8sub' (string expected, got ".. type(s).. ")")
      end
      if type(i) ~= "number" then
          error("bad argument #2 to 'utf8sub' (number expected, got ".. type(i).. ")")
      end
      if type(j) ~= "number" then
          error("bad argument #3 to 'utf8sub' (number expected, got ".. type(j).. ")")
      end

      -- resolve negative indices against the character count; the count
      -- costs a full pass over the string, so only take it when needed
      local startChar, endChar = i, j
      if i < 0 or j < 0 then
          local chars = utf8len(s)
          if i < 0 then
              startChar = chars + i + 1
          end
          if j < 0 then
              endChar = chars + j + 1
          end
      end

      -- like string.sub, a start before the first character means "from the
      -- first character"
      if startChar < 1 then
          startChar = 1
      end

      -- can't have start before end!
      if startChar > endChar then
          return ""
      end

      local pos = 1
      local bytes = s:len()
      local len = 0

      -- byte offsets to pass to string.sub; startByte stays nil until the
      -- start character is actually found, endByte defaults to the end of the
      -- string so an end index past the last character is clamped
      local startByte
      local endByte = bytes

      while pos <= bytes do
          len = len + 1

          if len == startChar then
              startByte = pos
          end

          pos = pos + utf8charbytes(s, pos)

          if len == endChar then
              endByte = pos - 1
              break
          end
      end

      -- the start index lies beyond the last character: nothing to return
      if not startByte then
          return ""
      end

      return s:sub(startByte, endByte)
  end

  -- expose as a string method: ("..."):utf8sub(i, j)
  string.utf8sub = utf8sub
  174.  
  -- Replaces UTF-8 characters according to a mapping table.
  --
  -- s:       the string to transform
  -- mapping: table keyed by single UTF-8 characters (as strings); a character
  --          with an entry is replaced by that entry's value, every other
  --          character is copied through unchanged
  --
  -- Returns the transformed string.
  -- Raises an error on wrongly typed arguments; errors raised by
  -- utf8charbytes for malformed UTF-8 propagate to the caller.
  local function utf8replace(s, mapping)
      -- argument checking
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
      end
      if type(mapping) ~= "table" then
          error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
      end

      local pos = 1
      local bytes = s:len()

      -- collect the output pieces and join them once at the end; appending to
      -- a string inside the loop would copy the whole result on every step
      local pieces = {}
      local count = 0

      while pos <= bytes do
          local charbytes = utf8charbytes(s, pos)
          local c = s:sub(pos, pos + charbytes - 1)

          count = count + 1
          pieces[count] = mapping[c] or c

          pos = pos + charbytes
      end

      return table.concat(pieces)
  end
  201.  
  -- Upper-cases s like string.upper, but using the Unicode simple case
  -- conversions held in the global table utf8_lc_uc (presumably supplied by
  -- utf8data.lua -- confirm). The table is looked up at call time.
  local function utf8upper(s)
      local upper_map = utf8_lc_uc
      return utf8replace(s, upper_map)
  end

  -- only offer the string method when the conversion table is available
  if utf8_lc_uc then
      string.utf8upper = utf8upper
  end
  209.  
  -- Lower-cases s like string.lower, but using the Unicode simple case
  -- conversions held in the global table utf8_uc_lc (presumably supplied by
  -- utf8data.lua -- confirm). The table is looked up at call time.
  local function utf8lower(s)
      local lower_map = utf8_uc_lc
      return utf8replace(s, lower_map)
  end

  -- only offer the string method when the conversion table is available
  if utf8_uc_lc then
      string.utf8lower = utf8lower
  end
  217.  
  -- Works like string.reverse, except that whole UTF-8 characters are
  -- reversed instead of individual bytes, so multi-byte characters stay
  -- intact.
  --
  -- s: the string to reverse
  --
  -- Returns the reversed string.
  -- Raises an error if s is not a string, or "Invalid UTF-8 character" when
  -- continuation bytes appear without a matching lead byte; errors raised by
  -- utf8charbytes for malformed UTF-8 propagate to the caller.
  local function utf8reverse(s)
      -- argument checking
      if type(s) ~= "string" then
          error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
      end

      local pos = s:len()

      -- collect the characters and join them once at the end; appending to a
      -- string inside the loop would copy the whole result on every step
      local pieces = {}
      local count = 0

      while pos > 0 do
          -- pos is the last byte of the character being extracted
          local last = pos
          local c = s:byte(pos)

          -- step back over continuation bytes to reach the lead byte
          while c >= 128 and c <= 191 do
              pos = pos - 1
              c = s:byte(pos)

              -- ran off the front of the string: the continuation bytes
              -- have no lead byte
              if not c then
                  error("Invalid UTF-8 character")
              end
          end

          local charbytes = utf8charbytes(s, pos)

          -- the character must end exactly where the backwards scan began,
          -- otherwise stray continuation bytes would be dropped silently
          if pos + charbytes - 1 ~= last then
              error("Invalid UTF-8 character")
          end

          count = count + 1
          pieces[count] = s:sub(pos, last)

          pos = pos - 1
      end

      return table.concat(pieces)
  end

  -- expose as a string method: ("..."):utf8reverse()
  string.utf8reverse = utf8reverse
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement