SHOW:
|
|
- or go back to the newest paste.
# Return true when `char` is a UTF-8 continuation ("extension") byte,
# i.e. a byte of the form 10xxxxxx: octal \200 through \277 INCLUSIVE.
# The empty string (e.g. substr() past the end of a string) is not a
# continuation byte and yields false.
#
# NOTE: the test is a plain string comparison, so it presumably relies
# on awk handling strings byte-by-byte (the situation this helper was
# written for) -- confirm on awk implementations that are locale-aware.
function isUTF8ExtensionChar(char) {
    return (char >= "\200" && char <= "\277")
}
4 | ||
# Extract a substring from a UTF-8 encoded string, counting in
# characters rather than bytes. This is required because not all
# versions of awk respect the encoding specified by $LANG; of
# particular interest is the default busybox awk shipped with Alpine
# Linux, where length() and substr() operate on bytes.
#
# Parameters:
#   str    the UTF-8 encoded input string
#   start  1-based index of the first character to return; values
#          below 1 are treated as 1
#   len    maximum number of characters to return; an omitted or
#          non-positive len yields the empty string
#
# Returns the requested characters as a byte-exact slice of str, or
# the empty string when start lies beyond the end of str.
#
# The remaining names in the parameter list are awk-style local
# variables; callers pass only str, start and len.
function substrUTF8(str, start, len,    inLen, subLen, inIndex, subIndex, outLen, charPos) {
    # Length of the input string (in bytes on a byte-oriented awk)
    inLen = length(str)

    # Current byte index into the input string
    inIndex = 1

    # 1-based index of the character that begins at inIndex
    charPos = 1

    # Skip the (start - 1) characters that precede the requested start.
    # Each iteration consumes one lead byte plus any continuation bytes
    # that follow it. The lead byte is consumed unconditionally so that
    # a stray continuation byte in malformed input cannot stall the loop.
    while (charPos < start && inIndex <= inLen) {
        inIndex++
        while (inIndex <= inLen && isUTF8ExtensionChar(substr(str, inIndex, 1))) {
            inIndex++
        }
        charPos++
    }

    # Byte index of the input string which corresponds to the first
    # character of the output string
    subIndex = inIndex

    # Number of whole characters collected for the output string
    outLen = 0

    # Advance inIndex past up to len characters to find the end point
    while (outLen < len && inIndex <= inLen) {
        inIndex++
        while (inIndex <= inLen && isUTF8ExtensionChar(substr(str, inIndex, 1))) {
            inIndex++
        }
        outLen++
    }

    # Byte length of the slice that holds the collected characters
    subLen = inIndex - subIndex

    return substr(str, subIndex, subLen)
}