Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdlib.h>
- #include <iconv.h>
- #include <stdio.h>
#include <stdint.h>

// The two decoder states callers test for.
// UTF8_ACCEPT: start state; also reached each time a complete code point
//              has been decoded.
// UTF8_REJECT: the bytes seen so far cannot be valid UTF-8.
// Any other state value means "in the middle of a multi-byte sequence".
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
//
// Layout, as used by decode() below:
//   utf8d[0..255]          maps an input byte to its character class;
//   utf8d[256 + 16*s + c]  is the state that follows state s on class c.
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/**
 * Feeds one byte to the UTF-8 decoding automaton.
 *
 * @param state  in/out: current automaton state; initialise to UTF8_ACCEPT
 *               before the first byte.
 * @param codep  in/out: code point accumulator; it holds the decoded code
 *               point whenever the function returns UTF8_ACCEPT.
 * @param byte   the next input byte. It indexes the class table directly,
 *               so it must be in 0..255 (callers pass an unsigned char).
 * @return the new state, which is also stored in *state.
 *
 * Declared `static inline` so the definition is usable from both C and C++
 * translation units: in C a plain `inline` definition emits no external
 * symbol and calls to it can fail to link.
 */
static inline uint32_t
decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
  uint32_t type = utf8d[byte];

  // Mid-sequence: shift in the six payload bits of a continuation byte.
  // At a sequence start: keep only the lead byte's payload bits; the mask
  // width is derived from the byte's class.
  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & (byte);

  *state = utf8d[256 + *state*16 + type];
  return *state;
}
// Opens an iconv descriptor that converts UTF-8 input to UTF-32LE output.
// On failure it prints a message to stdout and terminates the process with
// exit status 1, so a returned descriptor is always usable.
iconv_t initialize_iconv() {
  iconv_t conv_desc = iconv_open("UTF-32LE", "UTF-8");
  // iconv_open reports failure as (iconv_t)-1. Compare in the descriptor's
  // own type: narrowing the descriptor to int discards bits where it is
  // wider than int, and C++ compilers reject that cast outright.
  if (conv_desc == (iconv_t)-1) {
    printf("iconv initialization error\n");
    exit(1);
  }
  return conv_desc;
}

// Descriptor shared by every decode1() call; opened once during static
// initialisation at program start-up.
iconv_t conv = initialize_iconv();
- size_t decode1(const char s[5], unsigned out[8]) {
- char *in = const_cast<char*>(s);
- char *outBytes = reinterpret_cast<char*>(out);
- size_t nIn = 4;
- size_t nOut = 32;
- iconv(conv, 0,0,0,0); //reset
- size_t result = iconv(conv, &in, &nIn, &outBytes, &nOut);
- if (nIn != 0 || result != 0)
- return size_t(-1);
- //printf("xx:%d %d %d\n", (int)result, (int)nOut, (int)nIn);
- return 8 - nOut/4;
- }
- size_t decode2(const char s[4], unsigned out[8]) {
- uint32_t st = 0;
- uint32_t ch = 0;
- size_t iOut = 0;
- for (int i = 0; i < 4; ++i) {
- decode(&st, &ch, (unsigned char)s[i]);
- if (st == UTF8_REJECT)
- return -1;
- if (st == UTF8_ACCEPT)
- out[iOut++] = ch;
- }
- if (st != UTF8_ACCEPT)
- return -1;
- return iOut;
- }
// Overlays four raw bytes with one integer so that main() can walk through
// four-byte inputs simply by incrementing `i` and reading the bytes from `s`.
// `i` is a fixed-width uint32_t so that it covers exactly the four bytes of
// `s` regardless of the platform's `unsigned` width.
// NOTE(review): which byte of `i` lands in s[0] depends on host endianness.
union SrcChars {
  char s[4];
  uint32_t i;
};
- void check_char(const char *src, size_t len, unsigned expected) {
- uint32_t st = 0;
- uint32_t ch = 0;
- for (int i = 0; i < len; ++i) {
- decode(&st, &ch, (unsigned char)src[i]);
- if (st == UTF8_REJECT) {
- if (expected != ~0) {
- printf("invalid char!\n");
- exit(1);
- } else {
- return; //ok
- }
- }
- if (st == UTF8_ACCEPT) {
- if (i != len-1) {
- printf("expected 1 char\n");
- exit(1);
- }
- }
- }
- if (st != UTF8_ACCEPT) {
- printf("partial char\n");
- exit(1);
- }
- if (ch != expected) {
- printf("unexpected decoding\n");
- exit(1);
- }
- }
- int main() {
- check_char("\xf0\x9f\x92\xa9", 4, 128169);
- check_char("a", 1, 'a');
- check_char("*", 1, '*');
- check_char("я", 2, 1103);
- check_char("\xff", 1, ~0); //basic overflow
- check_char("\xc0\x80", 2, ~0);
- check_char("\xc1\x80", 2, ~0); //overlong
- check_char("\xc2\x80", 2, 128);
- check_char("\xe2\x82\xac", 3, 0x20AC);
- check_char("\xf4\x8f\xbf\xbf", 4, 0x0010ffff); //last unicode char
- check_char("\xf4\x8f\xbf\xc0", 4, ~0); //first after last
- check_char("\xed\xa0\x80", 3, ~0);//surrogate start
- check_char("\xed\x9f\xbf", 3, 0xD7ff);//non-surrogate
- unsigned i = 0;
- for (;;) {
- if ((i & 0xFFFFFF) == 0)
- printf("progress: %d/%d\n", i>>24, 256);
- unsigned out1[8] = { 0 };
- unsigned out2[8] = { 0 };
- SrcChars c;
- c.i = i;
- //size_t size = decode2("аa", out);
- size_t size1 = decode1(c.s, out1);
- size_t size2 = decode2(c.s, out2);
- if (size1 != size2) {
- printf("%02x|%02x|%02x|%02x\n", (unsigned char)c.s[0],(unsigned char)c.s[1],(unsigned char)c.s[2],(unsigned char)c.s[3]);
- printf("size mismatch for %x (%d and %d)\n", i, (int)size1, (int)size2);
- return 1;
- }
- if (size1 != size_t(-1)) {
- for (size_t j = 0; j < size1; ++j) {
- if (out1[j] != out2[j]) {
- printf("%02x|%02x|%02x|%02x\n", (unsigned char)c.s[0],(unsigned char)c.s[1],(unsigned char)c.s[2],(unsigned char)c.s[3]);
- printf("output mismatch for %x\n", i);
- return 1;
- }
- }
- }
- ++i;
- if (i == 0)
- break;
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement