// NOTE(review): this file looks damaged. Its contents sit on a single physical
// line, so everything after the first `//` below is lexically a comment, and
// text that was presumably enclosed in angle brackets is missing: the #include
// directives have no header names, `static_cast(state)-1` has no template
// argument, and the `for` loop in main() has lost its condition and the start
// of its body. Restore the original line breaks and the stripped text before
// building -- TODO confirm against the original source.
//
// BLOCK_SIZE (65536) is the size of the stack buffer that main() fills with
// fread() from stdin.
//
// CHECKPOINT and DEBUGPRINT(x) write file/line diagnostics to std::cerr when
// DEBUG is defined and expand to nothing otherwise.
//
// UTF8Decoder is a byte-at-a-time state machine.
//   State: ERROR (-1), DEFAULT (0), WAITING1..WAITING3 (1..3); the WAITINGn
//          value is the number of continuation bytes still expected.
//   Update(ch):
//     - in ERROR: returns ERROR without changing anything (sticky until Reset()).
//     - high bit clear (0xxxxxxx): accepted only in DEFAULT; increments count
//       and stores the byte in codePosition. In any WAITING state -> ERROR.
//     - 10xxxxxx: accepted only in a WAITING state; shifts six payload bits
//       into codePosition and steps the state down by one (WAITING1 -> DEFAULT).
//       In DEFAULT -> ERROR.
//     - 110xxxxx / 1110xxxx / 11110xxx in DEFAULT: increments count, stores the
//       lead byte's payload bits and moves to WAITING1 / WAITING2 / WAITING3.
//     - anything else (11111xxx, or a lead byte while WAITING) -> ERROR; the
//       tentative ++count is undone with --count before failing.
//   GetState()   returns the current state.
//   GetCurrent() returns codePosition; while a sequence is still in progress
//                this is only the partially assembled value.
//   GetCount()   returns count, which is incremented on the first byte of each
//                sequence, not on its completion.
//   Reset()      restores state, codePosition and count to their initial values.
//
// NOTE(review): the lead-byte tests such as `(ch & 0xC0) && (~ch & 0x20)` check
// that the masked value is non-zero rather than equal to the mask. They
// classify correctly only because the 0xxxxxxx and 10xxxxxx cases are handled
// earlier and each test is reached only after the previous one has failed.
//
// NOTE(review): Update() performs no range validation. Overlong forms (e.g.
// lead bytes 0xC0/0xC1), the surrogate range, and values above U+10FFFF (e.g.
// lead bytes 0xF5..0xF7) are all accepted as long as the byte pattern fits.
//
// NOTE(review): main() stores the size_t result of fread() in an int
// (`readedSize`). The rest of main() is cut off in this view.
#include #include // #include const int BLOCK_SIZE = 65536; #ifdef DEBUG #define CHECKPOINT { std::cerr << "CHECKPOINT: " << __FILE__ << " (" << __LINE__ << ")" << std::endl; } #define DEBUGPRINT(x) { std::cerr << "DEBUG: " << __FILE__ << " (" << __LINE__ << "): " << #x << "= " << x << std::endl; } #else #define CHECKPOINT #define DEBUGPRINT(x) #endif class UTF8Decoder { public: enum State { ERROR = -1 , DEFAULT , WAITING1 , WAITING2 , WAITING3 }; UTF8Decoder() : state(DEFAULT) , codePosition(0) , count(0) { } State Update(unsigned char ch) { DEBUGPRINT((unsigned int)ch); DEBUGPRINT(state); if(state==ERROR) return state; if(~ch & 0x80) { if(state==DEFAULT) { ++count; codePosition = ch; return state; } return (state = ERROR); } if((ch & 0x80) && (~ch & 0x40)) // 10xx xxxx { if(state==WAITING1 || state==WAITING2 || state==WAITING3) { codePosition = (codePosition << 6) | (ch & 0x3F); // 0x3F = 0011 1111 return (state = State(static_cast(state)-1)); } return (state = ERROR); } if(state==DEFAULT) { ++count; if((ch & 0xC0) && (~ch & 0x20)) // 110x xxxx { codePosition = ch & 0x1F; // 0x1F = 0001 1111 return (state = WAITING1); } if((ch & 0xE0) && (~ch & 0x10)) // 1110 xxxx { codePosition = ch & 0x0F; // 0x0F = 0000 1111 return (state = WAITING2); } if((ch & 0xF0) && (~ch & 0x08)) // 1111 0xxx { codePosition = ch & 0x07; // 0x07 = 0000 0111 return (state = WAITING3); } --count; } return (state = ERROR); } State GetState() const { return state; } uint32_t GetCurrent() const { return codePosition; } long long GetCount() const { return count; } void Reset() { state = DEFAULT; codePosition = 0; count = 0; } private: State state; uint32_t codePosition; long long count; }; int main() { using namespace std; long long allSize = 0; unsigned char buf[BLOCK_SIZE]; int readedSize; UTF8Decoder decoder; while(true){ readedSize = fread(buf, 1, BLOCK_SIZE, stdin); unsigned char *end = buf + readedSize; for(unsigned char *i=buf; i(decoder.GetCurrent())); } DEBUGPRINT(decoder.GetState()); } 
if(readedSize