Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Return true (1) when `char` is a UTF-8 continuation ("extension") byte,
# i.e. a byte of the form 10xxxxxx: octal \200 through \277 inclusive.
# Return false (0) for anything else, including the empty string that
# substr() yields when indexing past the end of a string.
#
# char: a one-byte string, typically substr(str, i, 1).
#
# NOTE: relies on awk comparing strings byte-wise, which is presumably the
# case for the byte-oriented awks this helper targets; confirm for others.
function isUTF8ExtensionChar(char) {
    # The upper bound must be inclusive: \277 (0xBF) is itself a valid
    # continuation byte (e.g. the second byte of U+00BF).
    return (char >= "\200" && char <= "\277")
}
# Extract a substring from a UTF-8 encoded string, counting in characters
# rather than bytes. This is required since not all versions of awk
# respect the encoding specified by $LANG. Of particular interest is the
# default busybox awk within Alpine linux.
#
# Parameters:
#   str    UTF-8 encoded input string (handled as bytes by the host awk).
#   start  1-based index of the first character to return, as with the
#          builtin substr(). Values <= 1 select the first character.
#   len    maximum number of characters to return.
# The remaining parameters (inLen, subLen, inIndex, subIndex, outLen) are
# awk-style local variables; callers must not pass them.
#
# Returns the requested characters, or "" when `start` lies beyond the end
# of the string or `len` is not positive.
#
# A "character" is one non-continuation byte followed by any run of
# continuation bytes. A stray continuation byte at the very start of the
# string is treated as the beginning of a character so the scan always
# makes progress.
function substrUTF8(str, start, len,    inLen, subLen, inIndex, subIndex, outLen) {
    # Length of input string (in bytes on a byte-oriented awk)
    inLen = length(str)
    # Current byte index into input string
    inIndex = 1

    # Skip whole characters until inIndex sits on character number `start`.
    # During this phase subIndex is the 1-based number of the character
    # that inIndex currently points at.
    subIndex = 1
    while (subIndex < start && inIndex <= inLen) {
        # Always consume the first byte so malformed input cannot stall us
        inIndex++
        while (inIndex <= inLen && isUTF8ExtensionChar(substr(str, inIndex, 1))) {
            inIndex++
        }
        subIndex++
    }

    # Byte index of the input string which corresponds to the first
    # character of the output string
    subIndex = inIndex
    # Byte length of the slice of the input which produces the output
    subLen = 0
    # Number of true unicode characters counted for the output string
    outLen = 0

    # Walk forward character by character to find the end of the slice
    while (outLen < len && inIndex <= inLen) {
        inIndex++
        subLen++
        while (inIndex <= inLen && isUTF8ExtensionChar(substr(str, inIndex, 1))) {
            inIndex++
            subLen++
        }
        outLen++
    }

    return substr(str, subIndex, subLen)
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement