olie305

Byte to Byte Val (Broken)

Jan 4th, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Lua 4.62 KB | None | 0 0
  1. -- Code starts at line 200 --
  2.  
  -- Cache the string-library functions used throughout this file as locals.
  local byte, len, sub = string.byte, string.len, string.sub

  -- Multipliers that move a 6-bit UTF-8 payload group left by 6, 12 and 18 bits.
  local shift_6, shift_12, shift_18 = 2^6, 2^12, 2^18
  10.  
  -- Returns the number of bytes (1-4) used by the UTF-8 character that starts
  -- at byte position `i` of `s`, validating the sequence against RFC 3629.
  --
  -- s : string to inspect
  -- i : byte position of the character's first byte (defaults to 1)
  --
  -- Raises an error when an argument has the wrong type, when `i` does not
  -- address a byte of `s`, when the string ends in the middle of a character,
  -- or when a byte is outside the range allowed for its position.
  local function utf8charbytes(s, i)
    -- argument defaults
    i = i or 1

    -- argument checking
    if type(s) ~= "string" then
      error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
    end
    if type(i) ~= "number" then
      error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
    end

    local c = byte(s, i)

    -- byte() yields nil when `i` is outside the string (including an empty
    -- string); report that directly instead of failing on a nil comparison.
    if not c then
      error("bad argument #2 to 'utf8charbytes' (position out of range)")
    end

    -- determine bytes needed for character, based on RFC 3629
    -- validate byte 1
    if c <= 127 then
      -- UTF8-1
      return 1

    elseif c >= 194 and c <= 223 then
      -- UTF8-2
      local c2 = byte(s, i + 1)

      if not c2 then
        error("UTF-8 string terminated early")
      end

      -- validate byte 2
      if c2 < 128 or c2 > 191 then
        error("Invalid UTF-8 character")
      end

      return 2

    elseif c >= 224 and c <= 239 then
      -- UTF8-3
      local c2 = byte(s, i + 1)
      local c3 = byte(s, i + 2)

      if not c2 or not c3 then
        error("UTF-8 string terminated early")
      end

      -- validate byte 2 (lead bytes 224 and 237 allow a narrower range)
      if c == 224 and (c2 < 160 or c2 > 191) then
        error("Invalid UTF-8 character")
      elseif c == 237 and (c2 < 128 or c2 > 159) then
        error("Invalid UTF-8 character")
      elseif c2 < 128 or c2 > 191 then
        error("Invalid UTF-8 character")
      end

      -- validate byte 3
      if c3 < 128 or c3 > 191 then
        error("Invalid UTF-8 character")
      end

      return 3

    elseif c >= 240 and c <= 244 then
      -- UTF8-4
      local c2 = byte(s, i + 1)
      local c3 = byte(s, i + 2)
      local c4 = byte(s, i + 3)

      if not c2 or not c3 or not c4 then
        error("UTF-8 string terminated early")
      end

      -- validate byte 2 (lead bytes 240 and 244 allow a narrower range)
      if c == 240 and (c2 < 144 or c2 > 191) then
        error("Invalid UTF-8 character2.1")
      elseif c == 244 and (c2 < 128 or c2 > 143) then
        error("Invalid UTF-8 character2.2")
      elseif c2 < 128 or c2 > 191 then
        error("Invalid UTF-8 character2.3")
      end

      -- validate byte 3
      if c3 < 128 or c3 > 191 then
        error("Invalid UTF-8 character3")
      end

      -- validate byte 4
      if c4 < 128 or c4 > 191 then
        error("Invalid UTF-8 character4")
      end

      return 4

    else
      -- 128-193 and 245-255 can never start a character
      error("Invalid UTF-8 character5")
    end
  end
  106.  
  107.  
  -- UTF-8 aware counterpart of string.sub: returns the characters of `s` from
  -- character index `i` through character index `j`, inclusive.
  --
  -- s : UTF-8 encoded string
  -- i : first character index; a negative value counts back from the end
  -- j : last character index (defaults to -1, the final character); a
  --     negative value counts back from the end
  --
  -- Returns "" when the start lies after the end. Errors raised by
  -- utf8charbytes for malformed input propagate to the caller.
  local function utf8sub(s, i, j)
    -- argument defaults
    j = j or -1

    local bytes = len(s)

    -- Negative indices are relative to the character count, so count the
    -- characters, but only when one is actually needed. The count is done
    -- inline because this file defines no utf8len helper to call.
    local startChar, endChar = i, j
    if i < 0 or j < 0 then
      local total, scan = 0, 1
      while scan <= bytes do
        scan = scan + utf8charbytes(s, scan)
        total = total + 1
      end
      if i < 0 then startChar = total + i + 1 end
      if j < 0 then endChar   = total + j + 1 end
    end

    -- can't have start before end!
    if startChar > endChar then
      return ""
    end

    -- byte offsets to pass to string.sub
    local startByte, endByte = 1, bytes

    local pos = 1
    local length = 0

    while pos <= bytes do
      length = length + 1

      if length == startChar then
        startByte = pos
      end

      pos = pos + utf8charbytes(s, pos)

      if length == endChar then
        endByte = pos - 1
        break
      end
    end

    -- start past the last character, or end before the first: empty result
    if startChar > length then startByte = bytes + 1 end
    if endChar   < 1      then endByte   = 0         end

    return sub(s, startByte, endByte)
  end
  149.  
  -- Decodes a single UTF-8 character of `str` into its numeric code point.
  --
  -- str      : UTF-8 encoded string
  -- i        : character index to decode (defaults to 1); used only when
  --            byte_pos is not given
  -- j        : defaults to i; only consulted by the i > j guard below
  -- byte_pos : optional byte offset of the character; when given, the
  --            character is read directly from that offset
  --
  -- Returns the code point and the character's length in bytes. Returns
  -- nothing when i > j. The code point is nil when the selected character is
  -- empty (length 0).
  local function utf8unicode(str, i, j, byte_pos)
    i = i or 1
    j = j or i

    if j < i then return end

    -- Isolate the character and find how many bytes it occupies.
    local char, size
    if not byte_pos then
      char = utf8sub(str, i, i)
      size = #char
    else
      size = utf8charbytes(str, byte_pos)
      char = sub(str, byte_pos, byte_pos + size - 1)
    end

    -- Value to subtract from the lead byte, indexed by character length.
    local lead_offset = ({ 0, 0xC0, 0xE0, 0xF0 })[size]

    local code
    if lead_offset then
      code = byte(char, 1) - lead_offset
      -- Each following byte contributes its value minus 0x80, appended after
      -- shifting the accumulated value left by six bits.
      for k = 2, size do
        code = code * shift_6 + (byte(char, k) - 0x80)
      end
    end

    return code, size
  end
  187.  
  -- Converts a non-negative whole number to its uppercase hexadecimal string.
  --
  -- IN : number to convert
  --
  -- Returns the hex digits without any prefix; 0 yields "0". A negative
  -- input yields "" because the digit loop never runs.
  local function hexy(IN)
    local B, K, O = 16, "0123456789ABCDEF", ""

    -- The digit loop below produces nothing for zero, so handle it here.
    if IN == 0 then
      return "0"
    end

    while IN > 0 do
      -- +1 because Lua strings are indexed from 1
      local D = math.fmod(IN, B) + 1
      IN = math.floor(IN / B)
      O = string.sub(K, D, D) .. O
    end

    return O
  end
  197.  
  198. -- Where the code starts --
  199.  
  -- Decode every character of the sample string and print one line per
  -- character. Walking by byte offset avoids hard-coding the character count
  -- and avoids rescanning the string from the start on every iteration.
  local sample = "!— ! ÿª #"
  local pos = 1
  while pos <= #sample do
    -- Locals rather than globals, so nothing leaks out of the script.
    local A, B = utf8unicode(sample, nil, nil, pos)
    --   [byte |    hex val    | char size]
    print(A.." | "..hexy(A).." | "..B)
    pos = pos + B
  end
Advertisement
Add Comment
Please, Sign In to add comment