View difference between Paste ID: NFJUP0R5 and
SHOW:
|
|
- or go back to the newest paste.
1 | - | |
#include <cstdint>
#include <cstdio>
#include <iostream>
// #include <cstdlib>
4 | ||
// Number of bytes requested from stdin per fread() call (64 KiB).
const int BLOCK_SIZE = 64 * 1024;
6 | ||
7 | ||
// Debug tracing helpers; they write to std::cerr only when DEBUG is defined.
//   CHECKPOINT     - prints the current file and line.
//   DEBUGPRINT(x)  - prints the file, line, the spelling of x and its value.
// Each macro expands to exactly one statement (do { } while (0)), so
// "if (c) DEBUGPRINT(x); else ..." parses the same way in both build modes,
// and the argument is parenthesised so any expression can be passed.
// In non-DEBUG builds the argument is not evaluated.
#ifdef DEBUG
#define CHECKPOINT do { std::cerr << "CHECKPOINT: " << __FILE__ << " (" << __LINE__ << ")" << std::endl; } while (0)
#define DEBUGPRINT(x) do { std::cerr << "DEBUG: " << __FILE__ << " (" << __LINE__ << "): " << #x << "= " << (x) << std::endl; } while (0)
#else
#define CHECKPOINT do { } while (0)
#define DEBUGPRINT(x) do { } while (0)
#endif
15 | ||
// Tracing inside the decoder is optional: when no DEBUGPRINT macro is in scope,
// fall back to a no-op so the class still compiles on its own.
#ifndef DEBUGPRINT
#define DEBUGPRINT(x)
#endif

/**
 * Incremental UTF-8 decoder and validator.
 *
 * Bytes are fed one at a time through Update(). Whenever the state returns to
 * DEFAULT a complete code point is available from GetCurrent(). Once the
 * decoder enters ERROR it stays there until Reset() is called.
 *
 * Besides structural errors (stray continuation byte, truncated sequence,
 * invalid lead byte) the decoder rejects sequences that are well-formed at the
 * bit level but forbidden by RFC 3629: overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF) and values above U+10FFFF.
 */
class UTF8Decoder
{
public:
    /** Decoder state; WAITINGn means n continuation bytes are still expected. */
    enum State {
          ERROR = -1
        , DEFAULT
        , WAITING1
        , WAITING2
        , WAITING3
    };

    UTF8Decoder()
        : state(DEFAULT)
        , codePosition(0)
        , count(0)
        , minCodePoint(0)
    { }

    /**
     * Consumes one input byte.
     *
     * @param ch next byte of the UTF-8 stream
     * @return the state after the byte: DEFAULT when a code point has just
     *         been completed, WAITINGn in the middle of a sequence, ERROR when
     *         the input is invalid (also returned for every later call).
     */
    State Update(unsigned char ch)
    {
        DEBUGPRINT((unsigned int)ch);
        DEBUGPRINT(state);

        if(state==ERROR)
            return state;

        if((ch & 0x80) == 0x00) // 0xxx xxxx: single-byte (ASCII) character
        {
            if(state!=DEFAULT)
                return (state = ERROR); // previous sequence was cut short
            ++count;
            codePosition = ch;
            return state;
        }

        if((ch & 0xC0) == 0x80) // 10xx xxxx: continuation byte
        {
            if(state==DEFAULT)
                return (state = ERROR); // continuation without a lead byte
            codePosition = (codePosition << 6) | (ch & 0x3F); // 0x3F = 0011 1111
            // WAITING3 -> WAITING2 -> WAITING1 -> DEFAULT
            state = static_cast<State>(static_cast<int>(state) - 1);
            // The value can only be judged once the last byte has arrived.
            if(state==DEFAULT && !IsValidCodePoint(codePosition, minCodePoint))
                state = ERROR;
            return state;
        }

        // From here on ch is 11xx xxxx, i.e. a candidate lead byte.
        if(state!=DEFAULT)
            return (state = ERROR); // lead byte inside an unfinished sequence

        if((ch & 0xE0) == 0xC0) // 110x xxxx: two-byte sequence
        {
            codePosition = ch & 0x1F; // 0x1F = 0001 1111
            minCodePoint = 0x80;
            state = WAITING1;
        }
        else if((ch & 0xF0) == 0xE0) // 1110 xxxx: three-byte sequence
        {
            codePosition = ch & 0x0F; // 0x0F = 0000 1111
            minCodePoint = 0x800;
            state = WAITING2;
        }
        else if((ch & 0xF8) == 0xF0) // 1111 0xxx: four-byte sequence
        {
            codePosition = ch & 0x07; // 0x07 = 0000 0111
            minCodePoint = 0x10000;
            state = WAITING3;
        }
        else
        {
            return (state = ERROR); // 1111 1xxx is never valid; not counted
        }

        ++count; // a character is counted as soon as its lead byte is accepted
        return state;
    }

    /** @return the current decoder state. */
    State GetState() const
    {
        return state;
    }

    /**
     * @return the code point accumulated so far; it is a complete character
     *         only while the state is DEFAULT.
     */
    uint32_t GetCurrent() const
    {
        return codePosition;
    }

    /**
     * @return the number of characters whose first byte has been accepted,
     *         including a multi-byte character that is still in progress.
     */
    long long GetCount() const
    {
        return count;
    }

    /** Returns the decoder to its freshly constructed state. */
    void Reset()
    {
        state = DEFAULT;
        codePosition = 0;
        count = 0;
        minCodePoint = 0;
    }

private:
    /**
     * Checks a fully assembled multi-byte value against the RFC 3629 limits.
     *
     * @param value   decoded code point
     * @param minimum smallest code point that needs a sequence of this length
     */
    static bool IsValidCodePoint(uint32_t value, uint32_t minimum)
    {
        if(value < minimum)
            return false; // overlong encoding
        if(value >= 0xD800 && value <= 0xDFFF)
            return false; // UTF-16 surrogate half
        return value <= 0x10FFFF; // upper end of the Unicode range
    }

    State state;
    uint32_t codePosition;
    long long count;
    uint32_t minCodePoint; // lower bound for the sequence currently being decoded
};
101 | ||
102 | int main() | |
103 | { | |
104 | using namespace std; | |
105 | ||
106 | long long allSize = 0; | |
107 | unsigned char buf[BLOCK_SIZE]; | |
108 | int readedSize; | |
109 | ||
110 | UTF8Decoder decoder; | |
111 | while(true){ | |
112 | readedSize = fread(buf, 1, BLOCK_SIZE, stdin); | |
113 | unsigned char *end = buf + readedSize; | |
114 | for(unsigned char *i=buf; i<end; ++i) | |
115 | { | |
116 | if(decoder.Update(*i)==UTF8Decoder::ERROR) | |
117 | { | |
118 | DEBUGPRINT(decoder.GetState()); | |
119 | fprintf(stderr,"Invalid UTF8 character. Char pos: %lld\n",decoder.GetCount()); | |
120 | return -1; | |
121 | } | |
122 | if(decoder.GetState()==UTF8Decoder::DEFAULT) | |
123 | { | |
124 | // printf("%08X\n",decoder.GetCurrent()); | |
125 | printf("%c",static_cast<char>(decoder.GetCurrent())); | |
126 | } | |
127 | DEBUGPRINT(decoder.GetState()); | |
128 | } | |
129 | if(readedSize<BLOCK_SIZE) | |
130 | break; | |
131 | } | |
132 | if(decoder.GetState()!=UTF8Decoder::DEFAULT) | |
133 | { | |
134 | DEBUGPRINT(decoder.GetState()); | |
135 | fprintf(stderr,"Unfinished UTF8 character. Char pos: %lld\n",decoder.GetCount()); | |
136 | return -1; | |
137 | } | |
138 | allSize = decoder.GetCount(); | |
139 | fprintf(stderr,"Characters found: %lld\n",allSize); | |
140 | return 0; | |
141 | } | |
# Reason for new version: corrected stderr semantics.