Advertisement
Guest User

Untitled

a guest
Mar 21st, 2013
421
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 5.24 KB | None | 0 0
  1.  
  2. #include <stdlib.h>
  3. #include <iconv.h>
  4. #include <stdio.h>
  5.  
  6. #define UTF8_ACCEPT 0
  7. #define UTF8_REJECT 1
  8. #include <stdint.h>
  9.  
  // Lookup table driving the UTF-8 decoding automaton used by decode().
  // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
  //
  // Layout:
  //   [0, 256)   : maps every possible input byte to a character class.
  //   [256, 400) : transition rows, 16 entries per state; the entry at
  //                256 + state*16 + class is the state reached from `state`
  //                after reading a byte of that class.
  static const uint8_t utf8d[] = {
    // ---- byte -> character class ----
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,                                 // e0..ef
    11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,                                 // f0..ff

    // ---- (state, class) -> next state, one row per state ----
    0,1,2,3,5,8,7,1,1,1,4,6,1,1,1,1, // s0 (UTF8_ACCEPT)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s1 (UTF8_REJECT)
    1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s2
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1, // s3
    1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s4
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1, // s5
    1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s6
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s7
    1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s8
  };
  28.  
  29. uint32_t inline
  30. decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  31.   uint32_t type = utf8d[byte];
  32.  
  33.   *codep = (*state != UTF8_ACCEPT) ?
  34.     (byte & 0x3fu) | (*codep << 6) :
  35.     (0xff >> type) & (byte);
  36.  
  37.   *state = utf8d[256 + *state*16 + type];
  38.   return *state;
  39. }
  40.  
  41.  
  42.  
  // Opens an iconv conversion descriptor that converts UTF-8 input to UTF-32LE.
  //
  // Returns the open descriptor. If the conversion cannot be set up, prints a
  // message and terminates the process with exit status 1, so callers never
  // see a failed descriptor.
  iconv_t initialize_iconv() {
      iconv_t conv_desc = iconv_open("UTF-32LE", "UTF-8");

      // iconv_open() signals failure with the sentinel (iconv_t)-1. Compare
      // against that value directly: iconv_t is typically a pointer, and
      // narrowing it to int loses bits (and is ill-formed in C++ on LP64).
      if (conv_desc == (iconv_t)-1) {
          printf("iconv initialization error\n");
          exit(1);
      }
      return conv_desc;
  }
  52.  
  53. iconv_t conv = initialize_iconv();
  54.  
  55. size_t decode1(const char s[5], unsigned out[8]) {
  56.     char *in = const_cast<char*>(s);
  57.     char *outBytes = reinterpret_cast<char*>(out);
  58.     size_t nIn = 4;
  59.     size_t nOut = 32;
  60.     iconv(conv, 0,0,0,0); //reset
  61.     size_t result = iconv(conv, &in, &nIn, &outBytes, &nOut);
  62.     if (nIn != 0 || result != 0)
  63.         return size_t(-1);
  64.     //printf("xx:%d %d %d\n", (int)result, (int)nOut, (int)nIn);
  65.     return 8 - nOut/4;
  66. }
  67.  
  68. size_t decode2(const char s[4], unsigned out[8]) {
  69.     uint32_t st = 0;
  70.     uint32_t ch = 0;
  71.     size_t iOut = 0;
  72.     for (int i = 0; i < 4; ++i) {
  73.        decode(&st, &ch, (unsigned char)s[i]);
  74.        if (st == UTF8_REJECT)
  75.            return -1;
  76.        if (st == UTF8_ACCEPT)
  77.            out[iOut++] = ch;
  78.     }
  79.     if (st != UTF8_ACCEPT)
  80.         return -1;
  81.     return iOut;
  82. }
  83.  
  // Overlays an unsigned counter with four raw bytes. main() stores a value in
  // `i` and hands `s` to both decoders, so every counter value becomes one
  // four-byte test input (bytes appear in the host's in-memory order).
  union SrcChars {
      char s[4];       // the four bytes fed to decode1()/decode2()
      unsigned int i;  // the counter those bytes are taken from
  };
  88.  
  89. void check_char(const char *src, size_t len, unsigned expected) {
  90.     uint32_t st = 0;
  91.     uint32_t ch = 0;
  92.     for (int i = 0; i < len; ++i) {
  93.        decode(&st, &ch, (unsigned char)src[i]);
  94.        if (st == UTF8_REJECT) {
  95.            if (expected != ~0) {
  96.                printf("invalid char!\n");
  97.                exit(1);
  98.            } else {
  99.                return; //ok
  100.            }
  101.        }
  102.        if (st == UTF8_ACCEPT) {
  103.            if (i != len-1) {
  104.                printf("expected 1 char\n");
  105.                exit(1);
  106.            }
  107.        }
  108.     }
  109.     if (st != UTF8_ACCEPT) {
  110.        printf("partial char\n");
  111.        exit(1);
  112.     }
  113.     if (ch != expected) {
  114.        printf("unexpected decoding\n");
  115.        exit(1);
  116.     }
  117. }
  118.  
  119. int main() {
  120.     check_char("\xf0\x9f\x92\xa9", 4, 128169);
  121.     check_char("a", 1, 'a');
  122.     check_char("*", 1, '*');
  123.     check_char("я", 2, 1103);
  124.     check_char("\xff", 1, ~0); //basic overflow
  125.     check_char("\xc0\x80", 2, ~0);
  126.     check_char("\xc1\x80", 2, ~0); //overlong
  127.     check_char("\xc2\x80", 2, 128);
  128.     check_char("\xe2\x82\xac", 3, 0x20AC);
  129.     check_char("\xf4\x8f\xbf\xbf", 4, 0x0010ffff); //last unicode char
  130.     check_char("\xf4\x8f\xbf\xc0", 4, ~0); //first after last
  131.     check_char("\xed\xa0\x80", 3, ~0);//surrogate start
  132.     check_char("\xed\x9f\xbf", 3, 0xD7ff);//non-surrogate
  133.  
  134.     unsigned i = 0;
  135.     for (;;) {
  136.         if ((i & 0xFFFFFF) == 0)
  137.             printf("progress: %d/%d\n", i>>24, 256);
  138.         unsigned out1[8] = { 0 };
  139.         unsigned out2[8] = { 0 };
  140.         SrcChars c;
  141.         c.i = i;
  142.         //size_t size = decode2("аa", out);
  143.         size_t size1 = decode1(c.s, out1);
  144.         size_t size2 = decode2(c.s, out2);
  145.         if (size1 != size2) {
  146.             printf("%02x|%02x|%02x|%02x\n", (unsigned char)c.s[0],(unsigned char)c.s[1],(unsigned char)c.s[2],(unsigned char)c.s[3]);
  147.             printf("size mismatch for %x (%d and %d)\n", i, (int)size1, (int)size2);
  148.             return 1;
  149.         }
  150.         if (size1 != size_t(-1)) {
  151.             for (size_t j = 0; j < size1; ++j) {
  152.                 if (out1[j] != out2[j]) {
  153.                     printf("%02x|%02x|%02x|%02x\n", (unsigned char)c.s[0],(unsigned char)c.s[1],(unsigned char)c.s[2],(unsigned char)c.s[3]);
  154.                     printf("output mismatch for %x\n", i);
  155.                     return 1;
  156.                 }
  157.             }
  158.         }
  159.         ++i;
  160.         if (i == 0)
  161.             break;
  162.     }
  163.     return 0;
  164. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement