View difference between Paste ID: qYyxPv64 and 5R4VuHxH
SHOW: | | - or go back to the newest paste.
1
/*
 * A simple UTF-16 (little endian) to UTF-8 converter
 *
 * Released into the public domain by TEG
 * ALL uses of this source code are permitted WITHOUT the author's consent
 */

7
#include <stdio.h>
8
#include <stdlib.h>
9
#include <string.h>
10
11
12
/*
 * Convert a Unicode code point to a UTF-8 byte sequence.
 *
 * unicode: the code point to encode. It must be a Unicode scalar value:
 *          within U+0000..U+10FFFF and outside the surrogate range
 *          U+D800..U+DFFF.
 * bytes:   output buffer receiving the 1 to 4 encoded bytes. The caller must
 *          provide room for at least 4 bytes. No terminator is appended.
 *
 * Returns 0 on success. Returns 1 if bytes is NULL or if unicode is not an
 * encodable scalar value; in that case nothing is written to bytes.
 */
int unicode_to_utf8(long unicode, unsigned char* bytes)
{
	int failure = (bytes == NULL),
		i,
		num_of_bytes = 0;
	unsigned char first_byte_ones, /* first byte's high-order ones */
		first_byte_bit_mask;

	if (unicode >= 0xd800 && unicode <= 0xdfff)
	{
		/* surrogates only exist as UTF-16 code units; well-formed UTF-8
		   never contains them */
		failure = 1;
	}
	else if (unicode >= 0x00 && unicode <= 0x7f)
	{
		num_of_bytes = 1;
	}
	else if (unicode >= 0x80 && unicode <= 0x7ff)
	{
		num_of_bytes = 2;
	}
	else if (unicode >= 0x800 && unicode <= 0xffff)
	{
		num_of_bytes = 3;
	}
	else if (unicode >= 0x10000 && unicode <= 0x10ffff)
	{
		num_of_bytes = 4;
	}
	else
	{
		/* negative, or beyond the last code point U+10FFFF */
		failure = 1;
	}

	if (! failure)
	{
		if (num_of_bytes == 1)
		{
			*(bytes) = (unsigned char) (unicode & 0xff);
		}
		else
		{
			/* the lead byte starts with num_of_bytes one-bits followed by a
			   zero-bit: 110xxxxx, 1110xxxx or 11110xxx */
			first_byte_ones = (unsigned char) ((0xff << (8 - num_of_bytes)) &
				0xff);
			/* the bits left over in the lead byte carry payload */
			first_byte_bit_mask = (unsigned char) (0xff >> (num_of_bytes + 1));

			/* first byte: the most significant payload bits */
			*(bytes) = (unsigned char) (((unicode >> (6 * (num_of_bytes - 1))) &
				first_byte_bit_mask) |
				first_byte_ones);

			/* subsequent byte(s): 10xxxxxx, six payload bits each */
			for (i = 2; i <= num_of_bytes; i++)
			{
				*(bytes + i - 1) = (unsigned char) (((unicode >> ((num_of_bytes - i) * 6)) &
					0x3f) |
					0x80);
			}
		}
	}

	return failure;
}
71
72
73
/*
 * Convert a UTF-16 code unit pair to a Unicode code point.
 *
 * high:    the first (or only) code unit.
 * low:     the second code unit of a surrogate pair, or the marker value
 *          0xffff to request that high be decoded on its own as a BMP
 *          character. 0xffff serves as the "no second unit" marker because
 *          U+FFFF is, by standard, a non-character.
 * unicode: receives the decoded code point on success; it is left untouched
 *          on failure.
 *
 * Returns 0 on success. Returns 1 if unicode is NULL or the unit pair is
 * invalid: a surrogate without a matching partner, or a non-surrogate high
 * unit combined with a low value other than 0xffff.
 */
int utf16units_to_unicode(unsigned short high, unsigned short low, long* unicode)
{
	int failure = (unicode == NULL);

	if (! failure)
	{
		/* supplementary character */
		if ((high >= 0xd800 && high <= 0xdbff) &&
			(low >= 0xdc00 && low <= 0xdfff))
		{
			/* widen to long before multiplying so the result (up to
			   0xffc00) cannot overflow where int is only 16 bits wide */
			*unicode = 0x10000L +
				((long) (high - 0xd800) * 0x400L +
				(long) (low - 0xdc00));
		}
		/* BMP character */
		else if ((high < 0xd800 || high > 0xdfff) &&
				low == 0xffff) /* 0xffff: no second unit supplied */
		{
			*unicode = high;
		}
		/* invalid unit pair */
		else
		{
			failure = 1;
		}
	}

	return failure;
}
103
104
105
/*
 * Convert a little-endian UTF-16 byte sequence to a UTF-8 byte sequence.
 *
 * fh_utf16le: stream to read the UTF-16LE bytes from, until end of file.
 * fh_utf8:    stream the resulting UTF-8 bytes are written to.
 *
 * Every code unit is assembled from two bytes (low byte first). A unit that
 * cannot be decoded on its own is combined with the following unit as a
 * surrogate pair. Neither stream is closed by this function.
 *
 * Returns 0 on success. Returns 1 if either stream is NULL, or as soon as a
 * malformed unit, an unmatched surrogate, an unencodable code point or a
 * write error is encountered; a diagnostic is then printed to stderr and the
 * conversion stops, leaving the output written so far in place.
 */
int utf16le_to_utf8(FILE* fh_utf16le, FILE* fh_utf8)
{
	int failure = (fh_utf16le == NULL || fh_utf8 == NULL);
	int c1, c2, c3, c4; /* int, so that EOF stays distinct from byte values */
	unsigned char d[4];
	long unicode = 0;
	unsigned short high, low;
	size_t num_of_bytes;

	if (! failure)
	{
		while ((c1 = fgetc(fh_utf16le)) != EOF)
		{
			if ((c2 = fgetc(fh_utf16le)) == EOF)
			{
				/* odd number of bytes: half a code unit is left over */
				fprintf(stderr, "Malformed character founded. Conversion aborted.\n");
				failure = 1;
				break;
			}

			high = (unsigned short) ((c2 << 8) | c1);

			/* 0xffff asks for the unit to be decoded on its own; this
			   fails exactly when the unit is a surrogate */
			if (utf16units_to_unicode(high, 0xffff, &unicode))
			{
				/* add low surrogate */
				if ((c3 = fgetc(fh_utf16le)) != EOF &&
					(c4 = fgetc(fh_utf16le)) != EOF)
				{
					low = (unsigned short) ((c4 << 8) | c3);
					if (utf16units_to_unicode(high, low, &unicode))
					{
						fprintf(stderr, "Mismatched surrogate pair found. Conversion aborted.\n");
						failure = 1;
						break;
					}
				}
				else
				{
					fprintf(stderr, "Lone surrogate unit found. Conversion aborted.\n");
					failure = 1;
					break;
				}
			}

			if (unicode_to_utf8(unicode, d))
			{
				fprintf(stderr, "Invalid Unicode code point found. Conversion aborted.\n");
				failure = 1;
				break;
			}

			/* derive the sequence length from the code point; measuring the
			   buffer with strlen() would drop U+0000, which encodes as a
			   single 0x00 byte */
			if (unicode <= 0x7f)
			{
				num_of_bytes = 1;
			}
			else if (unicode <= 0x7ff)
			{
				num_of_bytes = 2;
			}
			else if (unicode <= 0xffff)
			{
				num_of_bytes = 3;
			}
			else
			{
				num_of_bytes = 4;
			}

			if (fwrite(d, sizeof(unsigned char), num_of_bytes, fh_utf8) != num_of_bytes)
			{
				fprintf(stderr, "Write error. Conversion aborted.\n");
				failure = 1;
				break;
			}
		}
	}

	return failure;
}
168
169
170
/*
 * Program entry point.
 *
 * Expects exactly two command-line parameters: the path of the UTF-16LE
 * source file followed by the path of the UTF-8 destination file. Any
 * further parameter is reported as unknown.
 *
 * Returns 0 when the conversion succeeded and 1 otherwise (bad parameters,
 * a file that cannot be opened, a conversion error, or a failure to finish
 * writing the destination file).
 */
int main(int argc, char** argv)
{
	FILE *fh_utf16le = NULL,
		*fh_utf8 = NULL;
	char *source = NULL,
		*dest = NULL;
	int failure = 0,
		i;

	for (i = 1; i < argc; i++)
	{
		if (! source)
		{
			source = argv[i];
			continue;
		}
		if (! dest)
		{
			dest = argv[i];
			continue;
		}

		fprintf(stderr, "unknown parameter \"%s\"\n", argv[i]);
		failure = 1;
	}

	if (! failure)
	{
		if (! source || ! dest)
		{
			fprintf(stderr, "source and destination files must be specified\n");
			failure = 1;
		}
	}

	/* only touch the files once the parameters are known to be usable;
	   binary mode keeps the C library from translating line endings in
	   either stream */
	if (! failure)
	{
		fh_utf16le = fopen(source, "rb");
		if (fh_utf16le == NULL)
		{
			fprintf(stderr, "cannot open \"%s\" for reading\n", source);
			failure = 1;
		}
	}

	if (! failure)
	{
		fh_utf8 = fopen(dest, "wb");
		if (fh_utf8 == NULL)
		{
			fprintf(stderr, "cannot open \"%s\" for writing\n", dest);
			failure = 1;
		}
	}

	if (! failure)
	{
		failure = utf16le_to_utf8(fh_utf16le, fh_utf8);
	}

	/* fclose() flushes buffered output, so a failure here means the
	   destination file is incomplete */
	if (fh_utf8 != NULL && fclose(fh_utf8) != 0)
	{
		fprintf(stderr, "cannot finish writing \"%s\"\n", dest);
		failure = 1;
	}
	if (fh_utf16le != NULL)
	{
		fclose(fh_utf16le);
	}

	return failure;
}