Advertisement
aaaaaa123456789

UTF-8 checking functions

Aug 5th, 2014
227
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  // Returns nonzero when byte has the form 10xxxxxx, i.e. is a UTF-8 continuation byte.
  static int utf8_is_continuation (unsigned char byte) {
    return (byte & 0xc0) == 0x80;
  }

  /*
   * Checks whether a NUL-terminated string is well-formed UTF-8.
   *
   * string: NUL-terminated byte string to validate; must not be NULL.
   *
   * Returns 1 when every sequence in the string is well formed, 0 otherwise.
   * A sequence is rejected when it:
   *   - starts with a continuation byte or with a lead byte that can only begin an
   *     overlong or out-of-range sequence (0x80..0xc1, 0xf5..0xff),
   *   - has a missing or malformed continuation byte (including truncation by the
   *     terminating NUL),
   *   - encodes a value that fits in a shorter sequence (overlong form),
   *   - encodes a surrogate (0xd800..0xdfff) or a value above 0x10ffff.
   */
  int check_utf8 (const char * string) {
    // Plain char may be signed, which would make every byte >= 0x80 compare as
    // negative; all comparisons below are therefore done on unsigned char.
    const unsigned char * cursor = (const unsigned char *) string;
    while (*cursor) {
      unsigned char lead = *cursor;
      unsigned length;     // total bytes in the sequence, lead byte included
      unsigned minimum;    // smallest value that genuinely needs `length` bytes
      unsigned codepoint;  // payload bits accumulated so far
      if (lead < 0x80) {
        cursor ++;
        continue;
      }
      if (lead < 0xc2) return 0;
      if (lead < 0xe0) {
        length = 2;
        minimum = 0x80;
        codepoint = lead & 0x1f;
      } else if (lead < 0xf0) {
        length = 3;
        minimum = 0x800;
        codepoint = lead & 0x0f;
      } else if (lead < 0xf5) {
        length = 4;
        minimum = 0x10000;
        codepoint = lead & 0x07;
      } else {
        return 0;
      }
      for (unsigned offset = 1; offset < length; offset ++) {
        // The terminating NUL is not a continuation byte, so a truncated sequence
        // fails here before anything past the end of the string is read.
        if (!utf8_is_continuation(cursor[offset])) return 0;
        codepoint = (codepoint << 6) | (cursor[offset] & 0x3f);
      }
      if (codepoint < minimum) return 0;
      if ((codepoint >= 0xd800) && (codepoint < 0xe000)) return 0;
      if (codepoint > 0x10ffff) return 0;
      cursor += length;
    }
    return 1;
  }
  30.  
  /*
   * Decodes the payload of one multi-byte UTF-8 sequence.
   *
   * continuations: the bytes that follow the lead byte; up to `readahead` of them
   *                are read, stopping at the first byte that is not 10xxxxxx.
   * initial:       the lead byte of the sequence.
   * readahead:     number of continuation bytes to consume (0 to 5).
   *
   * Returns the decoded value, `initial` itself when readahead is 0, or -1 when
   * readahead is out of range or a continuation byte is malformed.
   *
   * Only the low (6 - readahead) bits of `initial` are used; the function does not
   * check that `initial` is the proper lead byte for that length, and it does not
   * reject overlong forms, surrogates or values above 0x10ffff.
   */
  int get_utf8_codepoint (const unsigned char * continuations, unsigned char initial, unsigned char readahead) {
    // Five continuation bytes yield 1 + 5 * 6 = 31 payload bits, the most that fits
    // in a non-negative int (assuming int is at least 32 bits wide). A sixth would
    // overflow the accumulator and could collide with the -1 error value.
    if (readahead > 5) return -1;
    if (!readahead) return initial;
    unsigned result = initial & ((1u << (6 - readahead)) - 1);
    for (unsigned char index = 0; index < readahead; index ++) {
      unsigned char byte = continuations[index];
      if ((byte & 0xc0) != 0x80) return -1;
      result = (result << 6) | (byte & 0x3fu);
    }
    return (int) result;
  }
Advertisement
RAW Paste Data Copied
Advertisement