View difference between Paste ID: NFJUP0R5 and
SHOW: | | - or go back to the newest paste.
1-
1+
#include <cstdint>
#include <cstdio>
#include <iostream>
// #include <cstdlib>
4
5
// Size in bytes of the stdin read buffer in main(); also the number of bytes
// requested from fread on each iteration of the read loop.
const int BLOCK_SIZE = 65536;
6
7
8
// Debug tracing helpers, active only when DEBUG is defined.
//
// CHECKPOINT     writes the current file and line to std::cerr.
// DEBUGPRINT(x)  writes the file, line, the spelled-out expression and its value.
//
// Both macros expand to a single statement and must be followed by a semicolon.
// The do { } while (0) wrapper keeps them safe inside an unbraced if/else, and
// the argument is parenthesized so that expressions containing operators with
// lower precedence than << are printed correctly. When DEBUG is not defined
// they expand to a no-op expression and the argument is not evaluated.
#ifdef DEBUG
#define CHECKPOINT do { std::cerr << "CHECKPOINT: " << __FILE__ << " (" << __LINE__ << ")" << std::endl; } while (0)
#define DEBUGPRINT(x) do { std::cerr << "DEBUG: " << __FILE__ << " (" << __LINE__ << "): " << #x << "= " << (x) << std::endl; } while (0)
#else
#define CHECKPOINT ((void)0)
#define DEBUGPRINT(x) ((void)0)
#endif
15
16
// Fallback so the class still compiles when the file-level debug macros are
// not in scope; this is a no-op when DEBUGPRINT is already defined.
#ifndef DEBUGPRINT
#define DEBUGPRINT(x) ((void)0)
#endif

/**
 * Incremental UTF-8 decoder and validator.
 *
 * Bytes are fed one at a time through Update(). The decoder tracks how many
 * continuation bytes are still expected, accumulates the code point of the
 * character being decoded, and counts the characters started so far.
 *
 * Besides structural errors (unexpected continuation byte, interrupted
 * multi-byte sequence, invalid lead byte) the decoder rejects sequences that
 * are structurally complete but not well-formed UTF-8: overlong encodings,
 * UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF. These are
 * detected when the last byte of the sequence arrives.
 */
class UTF8Decoder
{
public:
	/**
	 * Decoder state. WAITINGn means n more continuation bytes are expected;
	 * the numeric values of DEFAULT..WAITING3 are 0..3 and Update() relies on
	 * that ordering to count down. ERROR is sticky until Reset().
	 */
	enum State {
		ERROR = -1
		, DEFAULT
		, WAITING1
		, WAITING2
		, WAITING3
	};

	UTF8Decoder()
	: state(DEFAULT)
	, codePosition(0)
	, count(0)
	, minCodePoint(0)
	{ }

	/**
	 * Feeds one byte to the decoder.
	 *
	 * @param ch  next byte of the input stream
	 * @return    the state after consuming the byte. DEFAULT means a complete
	 *            character is available from GetCurrent(); ERROR means the
	 *            input is invalid and every further call returns ERROR.
	 */
	State Update(unsigned char ch)
	{
		DEBUGPRINT((unsigned int)ch);
		DEBUGPRINT(state);
		if(state==ERROR)
			return state;

		if(ch < 0x80) // 0xxx xxxx: single-byte character
		{
			// A plain ASCII byte may not interrupt a multi-byte sequence.
			if(state!=DEFAULT)
				return (state = ERROR);
			++count;
			codePosition = ch;
			return state;
		}

		if((ch & 0xC0) == 0x80) // 10xx xxxx: continuation byte
		{
			// A continuation byte is only valid while a sequence is open.
			if(state==DEFAULT)
				return (state = ERROR);
			codePosition = (codePosition << 6) | (ch & 0x3F); // 0x3F = 0011 1111
			state = State(static_cast<int>(state)-1);
			// The sequence just completed: make sure the value is well-formed.
			if(state==DEFAULT && !IsWellFormed())
				state = ERROR;
			return state;
		}

		// From here on ch is 11xx xxxx: a lead byte, valid only between characters.
		if(state!=DEFAULT)
			return (state = ERROR);

		if((ch & 0xE0) == 0xC0) // 110x xxxx: two-byte sequence
		{
			codePosition = ch & 0x1F; // 0x1F = 0001 1111
			minCodePoint = 0x80;
			state = WAITING1;
		}
		else if((ch & 0xF0) == 0xE0) // 1110 xxxx: three-byte sequence
		{
			codePosition = ch & 0x0F; // 0x0F = 0000 1111
			minCodePoint = 0x800;
			state = WAITING2;
		}
		else if((ch & 0xF8) == 0xF0) // 1111 0xxx: four-byte sequence
		{
			codePosition = ch & 0x07; // 0x07 = 0000 0111
			minCodePoint = 0x10000;
			state = WAITING3;
		}
		else
		{
			// 1111 1xxx is never a valid lead byte; the character is not counted.
			return (state = ERROR);
		}
		++count;
		return state;
	}

	/** Returns the current decoder state. */
	State GetState() const
	{
		return state;
	}

	/**
	 * Returns the code point accumulated so far. It is a complete character
	 * only when the state is DEFAULT and at least one byte has been consumed.
	 */
	uint32_t GetCurrent() const
	{
		return codePosition;
	}

	/**
	 * Returns the number of characters started so far, including a character
	 * that is still incomplete or that was rejected on its final byte.
	 */
	long long GetCount() const
	{
		return count;
	}

	/** Returns the decoder to its freshly constructed state. */
	void Reset()
	{
		state = DEFAULT;
		codePosition = 0;
		count = 0;
		minCodePoint = 0;
	}

private:
	/**
	 * Checks the just-completed multi-byte value: it must need the sequence
	 * length that was used (not overlong), must not be a surrogate, and must
	 * not exceed U+10FFFF.
	 */
	bool IsWellFormed() const
	{
		if(codePosition < minCodePoint)
			return false;
		if(codePosition >= 0xD800 && codePosition <= 0xDFFF)
			return false;
		return codePosition <= 0x10FFFF;
	}

	State state;
	uint32_t codePosition;
	long long count;
	uint32_t minCodePoint; // smallest value the open sequence may legally encode
};
101
102
int main()
103
{
104
	using namespace std;
105
	
106
	long long allSize = 0;
107
	unsigned char buf[BLOCK_SIZE];
108
	int readedSize;
109
	
110
	UTF8Decoder decoder;
111
	while(true){
112
		readedSize = fread(buf, 1, BLOCK_SIZE, stdin);
113
		unsigned char *end = buf + readedSize;
114
		for(unsigned char *i=buf; i<end; ++i)
115
		{
116
			if(decoder.Update(*i)==UTF8Decoder::ERROR)
117
			{
118
				DEBUGPRINT(decoder.GetState());
119
				fprintf(stderr,"Invalid UTF8 character. Char pos: %lld\n",decoder.GetCount());
120
				return -1;
121
			}
122
			if(decoder.GetState()==UTF8Decoder::DEFAULT)
123
			{
124
				// printf("%08X\n",decoder.GetCurrent());
125
				printf("%c",static_cast<char>(decoder.GetCurrent()));
126
			}
127
			DEBUGPRINT(decoder.GetState());
128
		}
129
		if(readedSize<BLOCK_SIZE)
130
			break;
131
	}
132
	if(decoder.GetState()!=UTF8Decoder::DEFAULT)
133
	{
134
		DEBUGPRINT(decoder.GetState());
135
		fprintf(stderr,"Unfinished UTF8 character. Char pos: %lld\n",decoder.GetCount());
136
		return -1;
137
	}
138
	allSize = decoder.GetCount();
139
	fprintf(stderr,"Characters found: %lld\n",allSize);
140
	return 0;
141
}
142
# Reason for new version: corrected stderr semantics.