#include <cstdint>
#include <cstdio>
#include <iostream>
// #include <cstdlib>
// Size in bytes of the input buffer in main(); also the number of bytes requested per fread() call.
const int BLOCK_SIZE = 65536;
#ifdef DEBUG
// Debug-only tracing to std::cerr. Each macro expands to a single
// do { } while (0) statement, so it is used like a function call (with a
// trailing ';') and stays safe inside an unbraced if/else.
#define CHECKPOINT do { std::cerr << "CHECKPOINT: " << __FILE__ << " (" << __LINE__ << ")" << std::endl; } while (0)
#define DEBUGPRINT(x) do { std::cerr << "DEBUG: " << __FILE__ << " (" << __LINE__ << "): " << #x << "= " << (x) << std::endl; } while (0)
#else
// Without DEBUG both macros are no-ops; the argument of DEBUGPRINT is not evaluated.
#define CHECKPOINT do { } while (0)
#define DEBUGPRINT(x) do { } while (0)
#endif

/**
 * Incremental UTF-8 decoder and validator: feed it one byte at a time via
 * Update() and inspect the resulting state.
 *
 * A character is complete whenever Update() returns DEFAULT; GetCurrent()
 * then holds its code point. Besides structural errors (stray continuation
 * byte, truncated sequence, invalid lead byte) the decoder also rejects
 * sequences that are structurally valid but ill-formed UTF-8: overlong
 * encodings, UTF-16 surrogates (U+D800..U+DFFF) and code points above
 * U+10FFFF. Once ERROR is reached it is sticky until Reset() is called.
 */
class UTF8Decoder
{
public:
    /// Decoder state. WAITINGn means n continuation bytes are still expected.
    enum State {
        ERROR = -1
        , DEFAULT
        , WAITING1
        , WAITING2
        , WAITING3
    };

    UTF8Decoder()
        : state(DEFAULT)
        , codePosition(0)
        , count(0)
        , minimum(0)
    { }

    /**
     * Consumes one input byte.
     * @param ch next byte of the stream
     * @return the state after consuming ch: DEFAULT when a character has just
     *         been completed, WAITINGn while inside a multi-byte sequence,
     *         ERROR when the input is not well-formed UTF-8 (and on every
     *         later call until Reset()).
     */
    State Update(unsigned char ch)
    {
        DEBUGPRINT((unsigned int)ch);
        DEBUGPRINT(state);
        if (state == ERROR)
            return state;

        if ((ch & 0xC0) == 0x80) // 10xx xxxx: continuation byte
        {
            if (state == DEFAULT)
                return (state = ERROR); // continuation byte with no sequence open
            codePosition = (codePosition << 6) | (ch & 0x3F); // 0x3F = 0011 1111
            // WAITING3 -> WAITING2 -> WAITING1 -> DEFAULT are consecutive enum values.
            state = State(static_cast<int>(state) - 1);
            if (state == DEFAULT && !IsWellFormed())
                state = ERROR; // complete, but overlong / surrogate / out of range
            return state;
        }

        // Any other byte has to start a new character.
        if (state != DEFAULT)
            return (state = ERROR); // sequence cut short

        if ((ch & 0x80) == 0x00) // 0xxx xxxx
            return BeginSequence(ch, 0x0, DEFAULT);
        if ((ch & 0xE0) == 0xC0) // 110x xxxx
            return BeginSequence(ch & 0x1F, 0x80, WAITING1);
        if ((ch & 0xF0) == 0xE0) // 1110 xxxx
            return BeginSequence(ch & 0x0F, 0x800, WAITING2);
        if ((ch & 0xF8) == 0xF0) // 1111 0xxx
            return BeginSequence(ch & 0x07, 0x10000, WAITING3);

        return (state = ERROR); // 1111 1xxx never appears in UTF-8; count is left untouched
    }

    /// @return the current decoder state.
    State GetState() const
    {
        return state;
    }

    /// @return the code point accumulated so far; it is a complete character
    ///         only while the state is DEFAULT.
    uint32_t GetCurrent() const
    {
        return codePosition;
    }

    /// @return the number of characters started so far (ASCII bytes plus
    ///         accepted lead bytes), including one that is still in progress.
    long long GetCount() const
    {
        return count;
    }

    /// Returns the decoder to its freshly constructed state, clearing ERROR.
    void Reset()
    {
        state = DEFAULT;
        codePosition = 0;
        count = 0;
        minimum = 0;
    }

private:
    /// Starts a new character: counts it, stores the payload bits of its first
    /// byte and remembers the smallest code point its length may encode.
    State BeginSequence(uint32_t payload, uint32_t smallest, State next)
    {
        ++count;
        codePosition = payload;
        minimum = smallest;
        return (state = next);
    }

    /// Checks a just-completed multi-byte character against the UTF-8 rules.
    bool IsWellFormed() const
    {
        if (codePosition < minimum)
            return false; // overlong encoding
        if (codePosition >= 0xD800 && codePosition <= 0xDFFF)
            return false; // UTF-16 surrogate
        return codePosition <= 0x10FFFF; // last Unicode code point
    }

    State state;
    uint32_t codePosition;
    long long count;
    uint32_t minimum; // smallest code point allowed for the sequence in progress
};
int main()
{
using namespace std;
long long allSize = 0;
unsigned char buf[BLOCK_SIZE];
int readedSize;
UTF8Decoder decoder;
while(true){
readedSize = fread(buf, 1, BLOCK_SIZE, stdin);
unsigned char *end = buf + readedSize;
for(unsigned char *i=buf; i<end; ++i)
{
if(decoder.Update(*i)==UTF8Decoder::ERROR)
{
DEBUGPRINT(decoder.GetState());
fprintf(stderr,"Invalid UTF8 character. Char pos: %lld\n",decoder.GetCount());
return -1;
}
if(decoder.GetState()==UTF8Decoder::DEFAULT)
{
// printf("%08X\n",decoder.GetCurrent());
printf("%c",static_cast<char>(decoder.GetCurrent()));
}
DEBUGPRINT(decoder.GetState());
}
if(readedSize<BLOCK_SIZE)
break;
}
if(decoder.GetState()!=UTF8Decoder::DEFAULT)
{
DEBUGPRINT(decoder.GetState());
fprintf(stderr,"Unfinished UTF8 character. Char pos: %lld\n",decoder.GetCount());
return -1;
}
allSize = decoder.GetCount();
fprintf(stderr,"Characters found: %lld\n",allSize);
return 0;
}
// Reason for new version: corrected stderr semantics.