View difference between Paste ID: <a href="/qYyxPv64">qYyxPv64</a> and <a href="/5R4VuHxH">5R4VuHxH</a>

/*
1		/*
2		* A simple UTF-16 (little endian) to UTF-8 converter
3		*
4		* Released into the public domain by TEG
5	-	* ALL uses of this source code are permitted WITHOUT the author's consent
5	+
6
7		#include <stdio.h>
8		#include <stdlib.h>
9		#include <string.h>
10
11
/*
 * Convert a Unicode code point to a UTF-8 byte sequence.
 *
 * unicode: code point to encode; accepted range is 0x0 .. 0x10ffff.
 * bytes:   output buffer. Between 1 and 4 bytes are stored and no
 *          terminator is appended, so it must have room for 4 bytes.
 *
 * Returns 0 on success, 1 if bytes is NULL or unicode is out of range.
 * Nothing is written to bytes on failure.
 *
 * Note: code points in the surrogate range 0xd800 .. 0xdfff are not
 * filtered out here; they are encoded as ordinary 3-byte sequences.
 */
int unicode_to_utf8(long unicode, unsigned char* bytes)
{
    int num_of_bytes;
    int i;
    unsigned char lead_marker; /* high-order one bits of the first byte */
    unsigned char lead_mask;   /* payload bits available in the first byte */

    if (bytes == NULL || unicode < 0x00 || unicode > 0x10ffff)
    {
        return 1;
    }

    if (unicode <= 0x7f)
    {
        num_of_bytes = 1;
    }
    else if (unicode <= 0x7ff)
    {
        num_of_bytes = 2;
    }
    else if (unicode <= 0xffff)
    {
        num_of_bytes = 3;
    }
    else
    {
        num_of_bytes = 4;
    }

    if (num_of_bytes == 1)
    {
        /* ASCII maps onto itself */
        bytes[0] = (unsigned char) (unicode & 0xff);
        return 0;
    }

    /* 2 bytes -> 110xxxxx, 3 bytes -> 1110xxxx, 4 bytes -> 11110xxx */
    lead_marker = (unsigned char) ((0xff << (8 - num_of_bytes)) & 0xff);
    lead_mask = (unsigned char) (0xff >> (num_of_bytes + 1));

    /* first byte: the topmost payload bits under the length marker */
    bytes[0] = (unsigned char) (((unicode >> (6 * (num_of_bytes - 1))) & lead_mask) |
                                lead_marker);

    /* continuation bytes: 10xxxxxx, six payload bits each, high bits first */
    for (i = 1; i < num_of_bytes; i++)
    {
        bytes[i] = (unsigned char) (((unicode >> (6 * (num_of_bytes - 1 - i))) & 0x3f) |
                                    0x80);
    }

    return 0;
}
71
72
/*
 * Convert a UTF-16 code unit pair to a Unicode code point.
 *
 * Two input shapes are accepted:
 *   - high in 0xd800 .. 0xdbff and low in 0xdc00 .. 0xdfff:
 *     a surrogate pair, combined into a supplementary code point
 *     (0x10000 .. 0x10ffff).
 *   - high outside 0xd800 .. 0xdfff and low == 0xffff:
 *     a single BMP unit; 0xffff in low acts as the "no second unit"
 *     marker (U+FFFF is, by standard, a non-character).
 *
 * unicode: receives the decoded code point on success.
 *
 * Returns 0 on success, 1 if unicode is NULL or the pair matches neither
 * shape. *unicode is left untouched on failure.
 */
int utf16units_to_unicode(unsigned short high, unsigned short low, long* unicode)
{
    int high_is_lead;
    int high_is_surrogate;
    int low_is_trail;

    if (unicode == NULL)
    {
        return 1;
    }

    high_is_lead = (high >= 0xd800 && high <= 0xdbff);
    high_is_surrogate = (high >= 0xd800 && high <= 0xdfff);
    low_is_trail = (low >= 0xdc00 && low <= 0xdfff);

    /* supplementary character */
    if (high_is_lead && low_is_trail)
    {
        /* computed in long so the product cannot overflow a 16-bit int */
        *unicode = 0x10000L +
                   ((long) (high - 0xd800) * 0x400L +
                    (long) (low - 0xdc00));
        return 0;
    }

    /* BMP character */
    if (!high_is_surrogate && low == 0xffff)
    {
        *unicode = high;
        return 0;
    }

    /* invalid unit pair */
    return 1;
}
103
104
/*
 * Convert a little-endian UTF-16 byte stream to a UTF-8 byte stream.
 *
 * fh_utf16le: input stream, read two bytes (one code unit) at a time,
 *             low byte first.
 * fh_utf8:    output stream receiving the UTF-8 encoding of every decoded
 *             code point.
 *
 * Returns 0 once the whole input has been converted, 1 if either stream
 * is NULL, the input is malformed (odd byte count, lone or mismatched
 * surrogate), or an I/O error occurs. A diagnostic is printed to stderr
 * for every failure except NULL streams. Conversion stops at the first
 * error; output written before that point is not undone.
 */
int utf16le_to_utf8(FILE* fh_utf16le, FILE* fh_utf8)
{
    int b1, b2, b3, b4;      /* int, so that EOF stays distinguishable */
    unsigned char utf8[4];
    size_t utf8_len;
    long unicode;
    unsigned short high, low;

    if (fh_utf16le == NULL || fh_utf8 == NULL)
    {
        return 1;
    }

    while ((b1 = fgetc(fh_utf16le)) != EOF)
    {
        if ((b2 = fgetc(fh_utf16le)) == EOF)
        {
            fprintf(stderr, "Malformed character founded. Conversion aborted.\n");
            return 1;
        }
        high = (unsigned short) ((b2 << 8) | b1);

        /* 0xffff as the second unit asks for a single-unit (BMP) decode;
           failure means this unit is a surrogate and needs a partner */
        if (utf16units_to_unicode(high, 0xffff, &unicode))
        {
            /* add low surrogate */
            if ((b3 = fgetc(fh_utf16le)) == EOF ||
                (b4 = fgetc(fh_utf16le)) == EOF)
            {
                fprintf(stderr, "Lone surrogate unit found. Conversion aborted.\n");
                return 1;
            }
            low = (unsigned short) ((b4 << 8) | b3);

            if (utf16units_to_unicode(high, low, &unicode))
            {
                fprintf(stderr, "Mismatched surrogate pair found. Conversion aborted.\n");
                return 1;
            }
        }

        if (unicode_to_utf8(unicode, utf8))
        {
            fprintf(stderr, "Invalid Unicode code point found. Conversion aborted.\n");
            return 1;
        }

        /* unicode_to_utf8 reports only success or failure, so the sequence
           length is derived from the code point. Measuring the buffer with
           strlen would drop U+0000, whose encoding is a single zero byte. */
        if (unicode <= 0x7f)
        {
            utf8_len = 1;
        }
        else if (unicode <= 0x7ff)
        {
            utf8_len = 2;
        }
        else if (unicode <= 0xffff)
        {
            utf8_len = 3;
        }
        else
        {
            utf8_len = 4;
        }

        if (fwrite(utf8, sizeof(unsigned char), utf8_len, fh_utf8) != utf8_len)
        {
            fprintf(stderr, "Write error. Conversion aborted.\n");
            return 1;
        }
    }

    /* fgetc returns EOF for read errors too; do not report those as success */
    if (ferror(fh_utf16le))
    {
        fprintf(stderr, "Read error. Conversion aborted.\n");
        return 1;
    }

    return 0;
}
168
169
/*
 * Command-line entry point: <program> SOURCE DEST
 *
 * Reads SOURCE as little-endian UTF-16 and writes its UTF-8 conversion to
 * DEST. Returns 0 on success and 1 on any failure (bad arguments, a file
 * that cannot be opened, a conversion error, or a failed close of DEST).
 */
int main(int argc, char** argv)
{
    FILE *fh_utf16le = NULL;
    FILE *fh_utf8 = NULL;
    char *source = NULL;
    char *dest = NULL;
    int failure = 0;
    int i;

    /* first positional argument is the source, second the destination */
    for (i = 1; i < argc; i++)
    {
        if (! source)
        {
            source = argv[i];
            continue;
        }
        if (! dest)
        {
            dest = argv[i];
            continue;
        }

        fprintf(stderr, "unknown parameter \"%s\"\n", argv[i]);
        failure = 1;
    }

    if (! failure && (! source || ! dest))
    {
        fprintf(stderr, "source and destination files must be specified\n");
        failure = 1;
    }

    /* never reach fopen with a missing (NULL) path */
    if (failure)
    {
        return failure;
    }

    /* binary mode: the data is raw UTF-16 bytes, so no newline or
       end-of-file character translation may be applied */
    fh_utf16le = fopen(source, "rb");
    if (fh_utf16le == NULL)
    {
        fprintf(stderr, "cannot open \"%s\" for reading\n", source);
        return 1;
    }

    fh_utf8 = fopen(dest, "wb");
    if (fh_utf8 == NULL)
    {
        fprintf(stderr, "cannot open \"%s\" for writing\n", dest);
        fclose(fh_utf16le);
        return 1;
    }

    failure = utf16le_to_utf8(fh_utf16le, fh_utf8);

    /* fclose flushes buffered output, so a failure here means lost data */
    if (fclose(fh_utf8) != 0)
    {
        fprintf(stderr, "cannot finish writing \"%s\"\n", dest);
        failure = 1;
    }
    fclose(fh_utf16le);

    return failure;
}