Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // Counting Unicode codepoints in UTF-8:
- #include <stdio.h>
/**
 * Count the Unicode code points in a NUL-terminated UTF-8 string.
 *
 * Every byte that is not a continuation byte (bit pattern 10xxxxxx) begins a
 * new code point, so counting those bytes yields the code point count. The
 * input is not validated: stray continuation bytes are simply not counted.
 *
 * @param s  NUL-terminated UTF-8 string; a null pointer is treated as empty.
 * @return   Number of code points in s (0 for a null pointer or "").
 */
int utf8_strlen(const char *s) {
    if (!s) {
        return 0;
    }

    int count = 0;
    for (; *s != '\0'; s++) {
        /* Cast first: plain char may be signed, and these are raw bytes. */
        if (((unsigned char)*s & 0xC0) != 0x80) {
            count++;
        }
    }
    return count;
}
- // Indexing (getting the N-th Unicode character):
/**
 * Locate the n-th code point (0-based) in a NUL-terminated UTF-8 string.
 *
 * The string is scanned from the start; each byte that is not a continuation
 * byte (10xxxxxx) is counted as the first byte of a code point. The input is
 * not validated.
 *
 * @param s  NUL-terminated UTF-8 string; may be NULL.
 * @param n  Zero-based code point index.
 * @return   Pointer to the first byte of the n-th code point, or NULL when
 *           s is NULL, n is negative, or the string holds n or fewer code
 *           points (the terminating NUL is never returned).
 */
const char *utf8_nth(const char *s, int n) {
    /* A negative index can never match, so skip the scan entirely. */
    if (s == NULL || n < 0) {
        return NULL;
    }

    int index = 0;
    for (; *s != '\0'; s++) {
        if (((unsigned char)*s & 0xC0) == 0x80) {
            continue; /* continuation byte: still inside the previous code point */
        }
        if (index == n) {
            return s;
        }
        index++;
    }
    return NULL; /* out of range */
}
- // Decoding a single UTF-8 codepoint:
- #include <stdint.h>
/**
 * Decode the UTF-8 sequence that starts at s[0].
 *
 * The lead byte determines the expected sequence length (1 to 4 bytes). Each
 * trailing byte is checked to be a continuation byte (10xxxxxx) before it is
 * consumed, so decoding never reads past the string's NUL terminator.
 *
 * On malformed input (a stray continuation byte, a lead byte of 0xF8 or
 * above, or a sequence cut short by a non-continuation byte or the
 * terminator) the function returns U+FFFD and sets *bytes to the number of
 * bytes examined before the offending one. That count is always at least 1,
 * so a caller advancing by *bytes always makes progress.
 *
 * Overlong encodings, surrogates and values above U+10FFFF are NOT rejected;
 * they decode to whatever their payload bits spell.
 *
 * @param s      Points at the first byte of a sequence inside a
 *               NUL-terminated string. Must not be NULL. If s[0] is the
 *               terminator, the result is 0 with *bytes == 1.
 * @param bytes  Out: number of bytes consumed. Must not be NULL.
 * @return       The decoded code point, or 0xFFFD for malformed input.
 */
uint32_t utf8_decode(const char *s, int *bytes) {
    const uint32_t replacement = 0xFFFD;
    const unsigned char lead = (unsigned char)s[0];
    uint32_t cp;
    int len;

    if (lead < 0x80) {
        *bytes = 1;
        return lead;
    } else if ((lead & 0xE0) == 0xC0) {
        len = 2;
        cp = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        len = 3;
        cp = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        len = 4;
        cp = lead & 0x07;
    } else {
        /* Stray continuation byte or invalid lead byte (0xF8..0xFF). */
        *bytes = 1;
        return replacement;
    }

    for (int i = 1; i < len; i++) {
        const unsigned char c = (unsigned char)s[i];
        if ((c & 0xC0) != 0x80) {
            /* Truncated sequence: stop here so the NUL (or the next lead
             * byte) is left for the caller and nothing past it is read. */
            *bytes = i;
            return replacement;
        }
        cp = (cp << 6) | (uint32_t)(c & 0x3F);
    }

    *bytes = len;
    return cp;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement