View difference between Paste ID: QPBERWQY and eJtW9VDG
SHOW: | | - or go back to the newest paste.
1
# Report whether `char` is a UTF-8 continuation ("extension") byte, i.e. a
# byte of the form 10xxxxxx: octal \200 (0x80) through \277 (0xBF) inclusive.
#
#   char  a one-byte string, typically obtained with substr(s, i, 1)
#
# Returns 1 for a continuation byte and 0 otherwise. The empty string (what
# substr() yields past the end of its input) compares below "\200" and so
# yields 0.
#
# NOTE(review): this relies on the awk implementation comparing strings
# byte-wise; confirm if used with an awk that collates by locale.
function isUTF8ExtensionChar(char) {
    # The upper bound is inclusive: "\277" is itself a valid continuation
    # byte (for example the second byte of U+00FF, encoded as \303\277).
    return (char >= "\200" && char <= "\277")
}
4
5
# Extract a substring from a UTF-8 encoded string. This is required
6
# since not all versions of awk respect the encoding specified by
7
# $LANG. Of particular interest for me is the default busybox awk
8
# within Alpine Linux.
9
# substrUTF8(str, start, len)
#
# Return up to `len` UTF-8 characters of `str`, beginning at the 1-based
# character position `start`, by walking the string one byte at a time.
#
#   str    input string, treated as a sequence of bytes holding UTF-8 text
#   start  1-based index of the first character to return; values below 1
#          are treated as 1
#   len    maximum number of characters to return; 0 or less yields ""
#
# Returns "" when `start` lies beyond the last character.
#
# The remaining parameters (inLen, subLen, inIndex, subIndex, outLen) are
# awk-style local variables and must not be supplied by callers.
#
# A character is taken to be one byte followed by every continuation byte
# that immediately follows it. A stray continuation byte at the cursor is
# therefore consumed as the start of a character, which guarantees forward
# progress on malformed input.
function substrUTF8(str, start, len, inLen, subLen, inIndex, subIndex, outLen) {
    # Length of the input string in bytes
    inLen = length(str)

    # Current byte index into the input string
    inIndex = 1

    # Skip the first (start - 1) characters. outLen temporarily holds the
    # 1-based character position of the byte at inIndex.
    outLen = 1
    while (outLen < start && inIndex <= inLen) {
        # Step over the first byte of this character ...
        inIndex++
        outLen++
        # ... and over all continuation bytes that belong to it
        while (inIndex <= inLen && isUTF8ExtensionChar(substr(str, inIndex, 1))) {
            inIndex++
        }
    }

    # Byte index of the first character of the output string
    subIndex = inIndex

    # Number of whole characters collected for the output string
    outLen = 0

    # Advance inIndex past up to `len` further characters
    while (outLen < len && inIndex <= inLen) {
        inIndex++
        outLen++
        while (inIndex <= inLen && isUTF8ExtensionChar(substr(str, inIndex, 1))) {
            inIndex++
        }
    }

    # Byte length of the output string
    subLen = inIndex - subIndex

    return substr(str, subIndex, subLen)
}