# Untitled

# Return true (1) if the single-byte string `char` is a UTF-8 continuation
# byte, i.e. lies in the inclusive range 0x80..0xBF (octal \200..\277).
# Returns false (0) for lead bytes, ASCII bytes and the empty string (which
# is what substr() yields when indexing past the end of a string).
function isUTF8ExtensionChar(char) {
    # The upper bound must be inclusive: 0xBF is a valid continuation byte.
    return (char >= "\200" && char <= "\277")
}

# Extract a substring from a UTF-8 encoded string, counting positions in
# characters rather than bytes. This is required since not all versions of
# awk respect the encoding specified by $LANG. Of particular interest is the
# default busybox awk within Alpine Linux.
#
# Parameters:
#   str   - the UTF-8 encoded input string
#   start - 1-based character index of the first character to return, as
#           with the builtin substr(); values below 1 behave like 1
#   len   - maximum number of characters to return (must be supplied)
#
# The remaining parameters (inLen, subLen, inIndex, subIndex, outLen,
# skipped) are local variables and must not be passed by callers.
#
# Returns the selected characters, or "" when start lies beyond the end of
# the string or len is not positive.
function substrUTF8(str, start, len, inLen, subLen, inIndex, subIndex, outLen, skipped) {
    # Length of input string (in whatever unit substr() indexes by)
    inLen = length(str)

    # Current index into input string
    inIndex = 1

    # Skip the start-1 characters that precede the requested start.
    skipped = 0
    while (skipped < start - 1 && inIndex <= inLen) {
        # Always consume one unit for the character's first byte, even if
        # it is a stray continuation byte, so malformed input cannot stall
        # the loop.
        inIndex++
        while (inIndex <= inLen && isUTF8ExtensionChar(substr(str, inIndex, 1))) {
            inIndex++
        }
        skipped++
    }

    # Starting index of input string which corresponds to the first
    # character of the output string
    subIndex = inIndex

    # Number of true unicode characters counted for output string
    outLen = 0

    # Walk forward over up to `len` characters to find the end point.
    while (outLen < len && inIndex <= inLen) {
        inIndex++
        while (inIndex <= inLen && isUTF8ExtensionChar(substr(str, inIndex, 1))) {
            inIndex++
        }
        outLen++
    }

    # Length of substring of input string which will produce the
    # output string
    subLen = inIndex - subIndex

    return substr(str, subIndex, subLen)
}