Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <cstdint>
#include <cstdio>
#include <iostream>
// #include <cstdlib>
// Number of bytes requested from standard input per read (64 KiB).
const int BLOCK_SIZE = 1 << 16;
// Debug tracing helpers. They write to std::cerr when the file is compiled
// with DEBUG defined and expand to nothing otherwise.
#ifdef DEBUG
#define CHECKPOINT { std::cerr << "CHECKPOINT: " << __FILE__ << " (" << __LINE__ << ")" << std::endl; }
#define DEBUGPRINT(x) { std::cerr << "DEBUG: " << __FILE__ << " (" << __LINE__ << "): " << #x << "= " << (x) << std::endl; }
#else
#define CHECKPOINT
#define DEBUGPRINT(x)
#endif

/**
 * Incremental UTF-8 decoder that is fed one byte at a time.
 *
 * The decoder keeps the code point currently being assembled, the number of
 * code points started so far, and a small state machine describing how many
 * continuation bytes are still expected.  Once the decoder enters the ERROR
 * state it stays there until Reset() is called.
 *
 * Only well-formed UTF-8 is accepted: overlong encodings, UTF-16 surrogate
 * code points (U+D800..U+DFFF) and values above U+10FFFF are rejected.
 */
class UTF8Decoder
{
public:
    /// Decoder state; WAITINGn means n continuation bytes are still expected.
    enum State {
        ERROR = -1
        , DEFAULT
        , WAITING1
        , WAITING2
        , WAITING3
    };

    UTF8Decoder()
        : state(DEFAULT)
        , codePosition(0)
        , minCodePoint(0)
        , count(0)
    { }

    /**
     * Consumes one input byte.
     *
     * @param ch next byte of the UTF-8 stream
     * @return the state after the byte: DEFAULT when a code point has just
     *         been completed, WAITINGn in the middle of a multi-byte
     *         sequence, ERROR when the input is not well-formed UTF-8
     */
    State Update(unsigned char ch)
    {
        DEBUGPRINT((unsigned int)ch);
        DEBUGPRINT(state);

        if (state == ERROR)
            return state;

        if (ch < 0x80) // 0xxx xxxx: single-byte (ASCII) code point
        {
            if (state != DEFAULT)
                return (state = ERROR); // ASCII byte inside a multi-byte sequence
            ++count;
            codePosition = ch;
            return state;
        }

        if (ch < 0xC0) // 10xx xxxx: continuation byte
        {
            if (state == DEFAULT)
                return (state = ERROR); // continuation without a lead byte
            codePosition = (codePosition << 6) | (ch & 0x3F); // 0x3F = 0011 1111
            state = State(static_cast<int>(state) - 1);
            // The value is only known once the last continuation byte arrived.
            if (state == DEFAULT && !IsCompletedCodePointValid())
                state = ERROR;
            return state;
        }

        // 11xx xxxx: lead byte of a multi-byte sequence.
        if (state != DEFAULT)
            return (state = ERROR); // new sequence started before the previous one ended

        // 0xC0/0xC1 can only start overlong encodings; 0xF5 and above would
        // encode values beyond U+10FFFF or are not valid lead bytes at all.
        // Such bytes are not counted as a started code point.
        if (ch < 0xC2 || ch > 0xF4)
            return (state = ERROR);

        ++count;
        if (ch < 0xE0) // 110x xxxx
        {
            codePosition = ch & 0x1F; // 0x1F = 0001 1111
            minCodePoint = 0x80;
            return (state = WAITING1);
        }
        if (ch < 0xF0) // 1110 xxxx
        {
            codePosition = ch & 0x0F; // 0x0F = 0000 1111
            minCodePoint = 0x800;
            return (state = WAITING2);
        }
        // 1111 0xxx
        codePosition = ch & 0x07; // 0x07 = 0000 0111
        minCodePoint = 0x10000;
        return (state = WAITING3);
    }

    /// Returns the current decoder state.
    State GetState() const
    {
        return state;
    }

    /// Returns the code point assembled so far (complete when the state is DEFAULT).
    std::uint32_t GetCurrent() const
    {
        return codePosition;
    }

    /// Returns the number of code points started since construction or the last Reset().
    long long GetCount() const
    {
        return count;
    }

    /// Clears the error state, the current code point and the counter.
    void Reset()
    {
        state = DEFAULT;
        codePosition = 0;
        minCodePoint = 0;
        count = 0;
    }

private:
    /// Checks a just-completed multi-byte code point for overlong form,
    /// surrogate range and the U+10FFFF upper bound.
    bool IsCompletedCodePointValid() const
    {
        if (codePosition < minCodePoint)
            return false; // overlong: a shorter encoding exists
        if (codePosition >= 0xD800 && codePosition <= 0xDFFF)
            return false; // UTF-16 surrogates are not valid scalar values
        return codePosition <= 0x10FFFF;
    }

    State state;
    std::uint32_t codePosition;
    std::uint32_t minCodePoint; // smallest value allowed for the current sequence length
    long long count;
};
- int main()
- {
- using namespace std;
- long long allSize = 0;
- unsigned char buf[BLOCK_SIZE];
- int readedSize;
- UTF8Decoder decoder;
- while(true){
- readedSize = fread(buf, 1, BLOCK_SIZE, stdin);
- unsigned char *end = buf + readedSize;
- for(unsigned char *i=buf; i<end; ++i)
- {
- if(decoder.Update(*i)==UTF8Decoder::ERROR)
- {
- DEBUGPRINT(decoder.GetState());
- fprintf(stderr,"Invalid UTF8 character. Char pos: %lld\n",decoder.GetCount());
- return -1;
- }
- if(decoder.GetState()==UTF8Decoder::DEFAULT)
- {
- // printf("%08X\n",decoder.GetCurrent());
- printf("%c",static_cast<char>(decoder.GetCurrent()));
- }
- DEBUGPRINT(decoder.GetState());
- }
- if(readedSize<BLOCK_SIZE)
- break;
- }
- if(decoder.GetState()!=UTF8Decoder::DEFAULT)
- {
- DEBUGPRINT(decoder.GetState());
- fprintf(stderr,"Unfinished UTF8 character. Char pos: %lld\n",decoder.GetCount());
- return -1;
- }
- allSize = decoder.GetCount();
- fprintf(stderr,"Characters found: %lld\n",allSize);
- return 0;
- }
- # Reason for the new version: corrected stderr semantics.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement