Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
-- Code starts at line 200 --

-- Local aliases for the string-library functions used by the UTF-8 helpers
-- below.
local byte = string.byte
local len = string.len
local sub = string.sub

-- Multipliers that move a 6-bit UTF-8 payload group into position
-- (the arithmetic equivalent of shifting left by 6, 12 and 18 bits).
-- They are written as integer literals instead of `2^n` because on Lua 5.3+
-- the `^` operator always produces a float, which made every multi-byte code
-- point come out as a float (printed as e.g. "255.0"). On Lua 5.1/5.2 the
-- values are unchanged.
local shift_6 = 64
local shift_12 = 4096
local shift_18 = 262144
--- Return the number of bytes occupied by the UTF-8 character that starts at
-- byte position `i` of `s`, validating the sequence against RFC 3629.
-- @param s  string to inspect
-- @param i  byte position of the character's first byte (defaults to 1)
-- @return   1, 2, 3 or 4
-- Raises an error when an argument has the wrong type, when `i` does not
-- address a byte of `s`, when the sequence is cut off by the end of the
-- string, or when any byte lies outside the range allowed for its position.
local function utf8charbytes(s, i)
  -- argument defaults
  i = i or 1

  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8charbytes' (string expected, got " .. type(s) .. ")")
  end
  if type(i) ~= "number" then
    error("bad argument #2 to 'utf8charbytes' (number expected, got " .. type(i) .. ")")
  end

  local c = byte(s, i)
  if not c then
    -- byte() yields nothing for a position outside the string (this includes
    -- the empty string); without this guard the range tests below fail with
    -- the unhelpful "attempt to compare nil with number".
    error("bad argument #2 to 'utf8charbytes' (position out of range)")
  end

  -- determine bytes needed for character, based on RFC 3629
  if c <= 127 then
    -- UTF8-1 (plain ASCII)
    return 1

  elseif c >= 194 and c <= 223 then
    -- UTF8-2
    local c2 = byte(s, i + 1)
    if not c2 then
      error("UTF-8 string terminated early")
    end
    -- validate byte 2
    if c2 < 128 or c2 > 191 then
      error("Invalid UTF-8 character")
    end
    return 2

  elseif c >= 224 and c <= 239 then
    -- UTF8-3
    local c2 = byte(s, i + 1)
    local c3 = byte(s, i + 2)
    if not c2 or not c3 then
      error("UTF-8 string terminated early")
    end
    -- validate byte 2: lead bytes 224 and 237 allow only part of the usual
    -- 128..191 continuation range
    if c == 224 and (c2 < 160 or c2 > 191) then
      error("Invalid UTF-8 character")
    elseif c == 237 and (c2 < 128 or c2 > 159) then
      error("Invalid UTF-8 character")
    elseif c2 < 128 or c2 > 191 then
      error("Invalid UTF-8 character")
    end
    -- validate byte 3
    if c3 < 128 or c3 > 191 then
      error("Invalid UTF-8 character")
    end
    return 3

  elseif c >= 240 and c <= 244 then
    -- UTF8-4
    local c2 = byte(s, i + 1)
    local c3 = byte(s, i + 2)
    local c4 = byte(s, i + 3)
    if not c2 or not c3 or not c4 then
      error("UTF-8 string terminated early")
    end
    -- validate byte 2: lead bytes 240 and 244 allow only part of the usual
    -- 128..191 continuation range
    if c == 240 and (c2 < 144 or c2 > 191) then
      error("Invalid UTF-8 character2.1")
    elseif c == 244 and (c2 < 128 or c2 > 143) then
      error("Invalid UTF-8 character2.2")
    elseif c2 < 128 or c2 > 191 then
      error("Invalid UTF-8 character2.3")
    end
    -- validate byte 3
    if c3 < 128 or c3 > 191 then
      error("Invalid UTF-8 character3")
    end
    -- validate byte 4
    if c4 < 128 or c4 > 191 then
      error("Invalid UTF-8 character4")
    end
    return 4
  end

  -- 128..193 and 245..255 can never start a character
  error("Invalid UTF-8 character5")
end
-- Count the UTF-8 characters in `s` by stepping over it one character at a
-- time with utf8charbytes (so malformed input raises the same errors).
local function count_utf8_chars(s)
  local count, pos, bytes = 0, 1, len(s)
  while pos <= bytes do
    count = count + 1
    pos = pos + utf8charbytes(s, pos)
  end
  return count
end

--- Character-indexed counterpart of string.sub for UTF-8 strings.
-- @param s  UTF-8 encoded string
-- @param i  index of the first character to keep (defaults to 1); a negative
--           value counts back from the last character
-- @param j  index of the last character to keep (defaults to -1, i.e. the
--           last character); a negative value counts back from the end
-- @return   the selected characters, or "" when the start lies after the end
-- Errors raised by utf8charbytes for malformed input propagate to the caller.
local function utf8sub(s, i, j)
  -- argument defaults
  i = i or 1
  j = j or -1

  local bytes = len(s)

  -- The total character count is only needed to resolve negative indices.
  -- It is computed here directly: the previous code called a `utf8len`
  -- function that is not defined in this file, so every negative index
  -- (including the default j = -1) failed with a call on a nil value.
  local total
  if i < 0 or j < 0 then
    total = count_utf8_chars(s)
  end
  local startChar = (i >= 0) and i or total + i + 1
  local endChar = (j >= 0) and j or total + j + 1

  -- can't have start before end!
  if startChar > endChar then
    return ""
  end

  -- byte offsets to pass to string.sub
  local startByte, endByte = 1, bytes

  local pos = 1     -- byte offset of the character being visited
  local length = 0  -- number of characters visited so far
  while pos <= bytes do
    length = length + 1
    if length == startChar then
      startByte = pos
    end
    pos = pos + utf8charbytes(s, pos)
    if length == endChar then
      -- pos now points at the next character; the slice ends just before it
      endByte = pos - 1
      break
    end
  end

  -- start beyond the last character, or end before the first: empty slice
  if startChar > length then startByte = bytes + 1 end
  if endChar < 1 then endByte = 0 end

  return sub(s, startByte, endByte)
end
--- Decode a single UTF-8 character of `str` into its numeric code point.
-- The character is located either by character index `i` (default 1, looked
-- up through utf8sub) or, when `byte_pos` is supplied, by the byte offset of
-- its first byte (measured with utf8charbytes).
-- @param str       UTF-8 encoded string
-- @param i         character index (defaults to 1)
-- @param j         defaults to `i`; only consulted for the `i > j` early exit
-- @param byte_pos  optional byte offset that takes precedence over `i`
-- @return code point and encoded length in bytes. The code point is nil when
--         the selected slice is empty or longer than four bytes. Nothing is
--         returned at all when `i > j`.
local function utf8unicode(str, i, j, byte_pos)
  i = i or 1
  j = j or i
  if i > j then
    return
  end

  -- Isolate the encoded character and its length in bytes.
  local char, size
  if byte_pos then
    size = utf8charbytes(str, byte_pos)
    char = sub(str, byte_pos, byte_pos - 1 + size)
  else
    char = utf8sub(str, i, i)
    size = #char
  end

  -- Amount subtracted from the lead byte to strip its length marker,
  -- indexed by the sequence length.
  local lead_marker = { 0, 0xC0, 0xE0, 0xF0 }

  local code
  if size >= 1 and size <= 4 then
    code = byte(char, 1) - lead_marker[size]
    -- Each following byte contributes six payload bits after its 0x80
    -- continuation marker is removed.
    for k = 2, size do
      code = code * shift_6 + (byte(char, k) - 0x80)
    end
  end

  return code, size
end
--- Render a non-negative whole number as an uppercase hexadecimal string
-- without any prefix or padding.
-- @param IN  number to convert
-- @return    hexadecimal digits, e.g. 255 -> "FF" and 0 -> "0".
--            A negative input yields "" because the digit loop never runs.
local function hexy(IN)
  local digits = "0123456789ABCDEF"

  -- Zero never enters the loop below, which used to produce an empty string.
  if IN == 0 then
    return "0"
  end

  local out = ""
  while IN > 0 do
    -- +1 because string indices are 1-based
    local d = math.fmod(IN, 16) + 1
    IN = math.floor(IN / 16)
    -- digits are produced least-significant first, so prepend
    out = string.sub(digits, d, d) .. out
  end
  return out
end
-- Where the code starts --
-- Demo: print the code point, its hexadecimal form and the encoded size of
-- each character of a sample string.
for i = 1, 9 do
  -- locals, so the demo no longer leaks `A` and `B` into the global table
  local code, size = utf8unicode("! ! ÿª #", i)
  -- utf8unicode returns a nil code point once `i` runs past the last
  -- character; stop there instead of failing on a nil concatenation.
  if not code then
    break
  end
  -- [code point | hex value | size in bytes]
  print(code .. " | " .. hexy(code) .. " | " .. size)
end
Advertisement
Add Comment
Please, Sign In to add comment