Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- if SERVER then
- AddCSLuaFile("utf8data.lua")
- end
- include("utf8data.lua")
- -- Copyright (c) 2006-2007, Kyle Smith
--- Returns the number of bytes used by the UTF-8 character that starts at
-- byte `i` of `s`. Also doubles as a validator for that single character,
-- following the byte ranges of RFC 3629.
-- @param s string to inspect
-- @param i byte offset of the character's first byte (defaults to 1)
-- @return 1, 2, 3 or 4
-- Raises an error when an argument has the wrong type, when `i` does not
-- address a byte of `s`, when the sequence is cut off by the end of the
-- string, or when a byte lies outside the range allowed for its position.
local function utf8charbytes(s, i)
    -- argument defaults
    i = i or 1

    -- argument checking
    if type(s) ~= "string" then
        error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
    end
    if type(i) ~= "number" then
        error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
    end

    local c = s:byte(i)
    if not c then
        -- s:byte returns nothing when i is outside the string; report that
        -- explicitly instead of failing on a nil comparison below
        error("bad argument #2 to 'utf8charbytes' (position out of range)")
    end

    -- determine bytes needed for character, based on RFC 3629
    if c <= 127 then
        -- UTF8-1 (%x00-7F): the NUL byte is a valid one-byte character
        return 1
    elseif c >= 194 and c <= 223 then
        -- UTF8-2
        local c2 = s:byte(i + 1)
        if not c2 then
            error("UTF-8 string terminated early")
        end
        if c2 < 128 or c2 > 191 then
            error("Invalid UTF-8 character")
        end
        return 2
    elseif c >= 224 and c <= 239 then
        -- UTF8-3
        local c2 = s:byte(i + 1)
        local c3 = s:byte(i + 2)
        if not c2 or not c3 then
            error("UTF-8 string terminated early")
        end
        -- byte 2 has a narrower range after 0xE0 (rejects overlong forms)
        -- and after 0xED (rejects UTF-16 surrogates)
        local lo, hi = 128, 191
        if c == 224 then
            lo = 160
        elseif c == 237 then
            hi = 159
        end
        if c2 < lo or c2 > hi then
            error("Invalid UTF-8 character")
        end
        if c3 < 128 or c3 > 191 then
            error("Invalid UTF-8 character")
        end
        return 3
    elseif c >= 240 and c <= 244 then
        -- UTF8-4
        local c2 = s:byte(i + 1)
        local c3 = s:byte(i + 2)
        local c4 = s:byte(i + 3)
        if not c2 or not c3 or not c4 then
            error("UTF-8 string terminated early")
        end
        -- byte 2 has a narrower range after 0xF0 (rejects overlong forms)
        -- and after 0xF4 (rejects code points above U+10FFFF)
        local lo, hi = 128, 191
        if c == 240 then
            lo = 144
        elseif c == 244 then
            hi = 143
        end
        if c2 < lo or c2 > hi then
            error("Invalid UTF-8 character")
        end
        if c3 < 128 or c3 > 191 then
            error("Invalid UTF-8 character")
        end
        if c4 < 128 or c4 > 191 then
            error("Invalid UTF-8 character")
        end
        return 4
    else
        -- continuation byte in lead position, or 0xC0/0xC1/0xF5..0xFF
        error("Invalid UTF-8 character")
    end
end
--- Counts the UTF-8 characters in `s`.
-- Steps through the string one character at a time with utf8charbytes, so
-- any error that function raises for malformed input propagates from here.
-- @param s string to measure
-- @return number of characters (0 for the empty string)
local function utf8len(s)
    if type(s) ~= "string" then
        error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
    end

    local total_bytes = #s
    local offset = 1
    local count = 0
    while offset <= total_bytes do
        -- advance by the width of the character starting at offset
        offset = offset + utf8charbytes(s, offset)
        count = count + 1
    end
    return count
end
-- expose as a string method: ("..."):utf8len()
string.utf8len = utf8len
--- Works like string.sub except that `i` and `j` count UTF-8 characters
-- instead of bytes.
-- @param s source string
-- @param i index of the first character; negative values count from the end
-- @param j index of the last character (defaults to -1, the last character)
-- @return the selected substring, or "" when the range selects nothing
-- Raises an error on wrong argument types. Errors raised by utf8charbytes
-- for malformed input propagate.
local function utf8sub(s, i, j)
    -- argument defaults
    j = j or -1

    -- argument checking
    if type(s) ~= "string" then
        error("bad argument #1 to 'utf8sub' (string expected, got ".. type(s).. ")")
    end
    if type(i) ~= "number" then
        error("bad argument #2 to 'utf8sub' (number expected, got ".. type(i).. ")")
    end
    if type(j) ~= "number" then
        error("bad argument #3 to 'utf8sub' (number expected, got ".. type(j).. ")")
    end

    local bytes = #s

    -- the character count is only needed to resolve negative indices
    local start_char, end_char = i, j
    if i < 0 or j < 0 then
        local total = s:utf8len()
        if i < 0 then
            start_char = total + i + 1
        end
        if j < 0 then
            end_char = total + j + 1
        end
    end

    -- like string.sub, a start before the first character means "from 1"
    if start_char < 1 then
        start_char = 1
    end
    -- can't have start before end; this also covers an end before character 1
    if start_char > end_char then
        return ""
    end

    -- byte offsets to pass to string.sub
    local start_byte, end_byte = nil, bytes
    local pos, count = 1, 0
    while pos <= bytes do
        count = count + 1
        if count == start_char then
            start_byte = pos
        end
        pos = pos + utf8charbytes(s, pos)
        if count == end_char then
            end_byte = pos - 1
            break
        end
    end

    -- the start lies past the last character: nothing is selected
    if not start_byte then
        return ""
    end
    return s:sub(start_byte, end_byte)
end
string.utf8sub = utf8sub
--- Replaces UTF-8 characters according to a mapping table.
-- Each character of `s` is looked up in `mapping`; when the lookup yields a
-- truthy value that value is emitted, otherwise the character is kept.
-- @param s source string
-- @param mapping table keyed by single UTF-8 characters
-- @return the rewritten string
-- Raises an error on wrong argument types. Errors raised by utf8charbytes
-- for malformed input propagate.
local function utf8replace(s, mapping)
    -- argument checking
    if type(s) ~= "string" then
        error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
    end
    if type(mapping) ~= "table" then
        error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
    end

    local bytes = #s
    local pos = 1
    -- collect the pieces and join once; appending to a string inside the
    -- loop copies the whole result on every iteration
    local parts, n = {}, 0
    while pos <= bytes do
        local charbytes = utf8charbytes(s, pos)
        local ch = s:sub(pos, pos + charbytes - 1)
        n = n + 1
        parts[n] = mapping[ch] or ch
        pos = pos + charbytes
    end
    return table.concat(parts)
end
--- Upper-cases `s` like string.upper, but applies the Unicode simple case
-- conversions listed in the global `utf8_lc_uc` table.
-- @param s string to convert
-- @return the converted string
local function utf8upper(s)
    return utf8replace(s, utf8_lc_uc)
end

-- only expose the method when the lower-to-upper table is defined
if utf8_lc_uc then
    string.utf8upper = utf8upper
end
--- Lower-cases `s` like string.lower, but applies the Unicode simple case
-- conversions listed in the global `utf8_uc_lc` table.
-- @param s string to convert
-- @return the converted string
local function utf8lower(s)
    return utf8replace(s, utf8_uc_lc)
end

-- only expose the method when the upper-to-lower table is defined
if utf8_uc_lc then
    string.utf8lower = utf8lower
end
--- Works like string.reverse, but reverses UTF-8 characters rather than
-- bytes, so multi-byte characters stay intact.
-- @param s string to reverse
-- @return a string with the characters of `s` in reverse order
-- Raises an error on a non-string argument and on malformed UTF-8, which
-- includes continuation bytes that do not belong to any character.
local function utf8reverse(s)
    -- argument checking
    if type(s) ~= "string" then
        error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
    end

    local pos = #s
    -- collect characters and join once instead of growing a string in the loop
    local parts, n = {}, 0
    while pos > 0 do
        -- last byte of the character currently being collected
        local last = pos
        local c = s:byte(pos)
        -- walk backwards over continuation bytes (0x80..0xBF) to the lead byte
        while c >= 128 and c <= 191 do
            pos = pos - 1
            if pos < 1 then
                -- ran off the front: the string begins with a continuation byte
                error("Invalid UTF-8 character")
            end
            c = s:byte(pos)
        end
        local charbytes = utf8charbytes(s, pos)
        if pos + charbytes - 1 ~= last then
            -- more continuation bytes were skipped than the lead byte accounts
            -- for; refuse rather than silently drop them from the result
            error("Invalid UTF-8 character")
        end
        n = n + 1
        parts[n] = s:sub(pos, last)
        pos = pos - 1
    end
    return table.concat(parts)
end
string.utf8reverse = utf8reverse
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement