Advertisement
Guest User

Untitled

a guest
Jun 18th, 2025
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 1.16 KB | Source Code | 0 0
  1. // Counting Unicode codepoints in UTF-8:
  2.  
  3. #include <stdio.h>
  4.  
  // Count the Unicode codepoints in a NUL-terminated UTF-8 string.
  //
  // Every byte that is not a continuation byte (bit pattern 10xxxxxx) begins a
  // codepoint, so counting those bytes counts codepoints. The input is not
  // validated: malformed data is simply counted by its non-continuation bytes.
  //
  // s: NUL-terminated byte string; NULL is accepted.
  // Returns the number of codepoints, or 0 for NULL or the empty string.
  int utf8_strlen(const char *s) {
      int count = 0;

      if (!s) {
          return 0;
      }
      for (; *s != '\0'; s++) {
          // Go through unsigned char so the mask does not depend on whether
          // plain char is signed on this platform.
          if (((unsigned char)*s & 0xC0) != 0x80) {
              count++;
          }
      }
      return count;
  }
  14.  
  15. // Indexing (getting a pointer to the N-th Unicode codepoint, 0-based):
  16.  
  // Locate the n-th codepoint (0-based) of a NUL-terminated UTF-8 string.
  //
  // Walks the string byte by byte; each byte that is not a continuation byte
  // (10xxxxxx) marks the start of a codepoint. The input is not validated.
  //
  // s: NUL-terminated byte string; NULL is accepted.
  // n: 0-based codepoint index.
  // Returns a pointer to the first byte of the n-th codepoint, or NULL when s
  // is NULL, n is negative, or the string holds n or fewer codepoints.
  const char *utf8_nth(const char *s, int n) {
      if (!s || n < 0) {
          return NULL; // Nothing to index, or an index that can never match
      }

      int index = 0;
      for (; *s != '\0'; s++) {
          if (((unsigned char)*s & 0xC0) == 0x80) {
              continue; // Continuation byte: still inside the previous codepoint
          }
          if (index == n) {
              return s;
          }
          index++;
      }
      return NULL; // Out of range
  }
  29.  
  30. // Decoding a single UTF-8 codepoint:
  31.  
  32. #include <stdint.h>
  33.  
  // Decode the single UTF-8 codepoint that starts at s.
  //
  // s:     pointer to the first byte of the sequence (NUL-terminated buffer);
  //        NULL is accepted.
  // bytes: receives the number of bytes consumed; may be NULL if the caller
  //        does not need it.
  //
  // Returns the decoded codepoint and stores the sequence length (1-4) in
  // *bytes. A NUL byte decodes as codepoint 0 with length 1.
  //
  // Malformed input yields U+FFFD (REPLACEMENT CHARACTER) with *bytes set to 1
  // so a caller can skip one byte and resynchronise. Malformed means: an
  // invalid lead byte, a missing or invalid continuation byte, an overlong
  // encoding, a UTF-16 surrogate, or a value above U+10FFFF. For NULL s,
  // U+FFFD is returned and *bytes is set to 0.
  uint32_t utf8_decode(const char *s, int *bytes) {
      const uint32_t replacement = 0xFFFD;
      const unsigned char *p = (const unsigned char *)s;
      uint32_t cp = 0;
      uint32_t min = 0; // Smallest codepoint that legitimately needs `len` bytes
      int len = 0;      // Expected sequence length; 0 means invalid lead byte

      if (!p) {
          if (bytes) {
              *bytes = 0;
          }
          return replacement;
      }

      unsigned char c = p[0];
      if (c < 0x80) {
          if (bytes) {
              *bytes = 1;
          }
          return c;
      } else if ((c & 0xE0) == 0xC0) {
          len = 2;
          cp = (uint32_t)(c & 0x1F);
          min = 0x80;
      } else if ((c & 0xF0) == 0xE0) {
          len = 3;
          cp = (uint32_t)(c & 0x0F);
          min = 0x800;
      } else if ((c & 0xF8) == 0xF0) {
          len = 4;
          cp = (uint32_t)(c & 0x07);
          min = 0x10000;
      }

      int ok = (len != 0);
      for (int i = 1; ok && i < len; i++) {
          // Checking the 10xxxxxx pattern also stops at the NUL terminator, so
          // a truncated sequence never reads past the end of the string.
          if ((p[i] & 0xC0) != 0x80) {
              ok = 0;
          } else {
              cp = (cp << 6) | (uint32_t)(p[i] & 0x3F);
          }
      }

      // Reject overlong forms, surrogates and values beyond the Unicode range.
      if (ok && (cp < min || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))) {
          ok = 0;
      }

      if (!ok) {
          if (bytes) {
              *bytes = 1;
          }
          return replacement;
      }

      if (bytes) {
          *bytes = len;
      }
      return cp;
  }
  57.  
Tags: unicode
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement