SHOW:
|
|
- or go back to the newest paste.
/*
 * A simple UTF-16 (little endian) to UTF-8 converter
 *
 * Released into the public domain by TEG
 * ALL uses of this source code are permitted WITHOUT the author's consent
 */
6 | ||
7 | #include <stdio.h> | |
8 | #include <stdlib.h> | |
9 | #include <string.h> | |
10 | ||
11 | ||
/*
 * Convert a Unicode code point to a UTF-8 byte sequence.
 *
 * unicode: the code point to encode. It must be a Unicode scalar value:
 *          0..0x10ffff, excluding the surrogate range 0xd800..0xdfff
 *          (surrogates are UTF-16 artefacts and have no valid UTF-8 form).
 * bytes:   output buffer. Between 1 and 4 bytes are stored, so it must have
 *          room for at least 4. No terminator is written.
 *
 * Returns 0 on success, 1 on failure (NULL buffer or a value that cannot be
 * encoded). Nothing is written to bytes on failure.
 */
int unicode_to_utf8(long unicode, unsigned char* bytes)
{
    int num_of_bytes;
    int i;
    unsigned char first_byte_ones;      /* first byte's high-order ones */
    unsigned char first_byte_bit_mask;  /* payload bits left in the first byte */

    if (bytes == NULL)
    {
        return 1;
    }

    /* surrogate code points must never appear in UTF-8 */
    if (unicode >= 0xd800 && unicode <= 0xdfff)
    {
        return 1;
    }

    if (unicode >= 0x00 && unicode <= 0x7f)
    {
        num_of_bytes = 1;
    }
    else if (unicode >= 0x80 && unicode <= 0x7ff)
    {
        num_of_bytes = 2;
    }
    else if (unicode >= 0x800 && unicode <= 0xffff)
    {
        num_of_bytes = 3;
    }
    else if (unicode >= 0x10000 && unicode <= 0x10ffff)
    {
        num_of_bytes = 4;
    }
    else
    {
        /* negative or beyond the last Unicode code point */
        return 1;
    }

    if (num_of_bytes == 1)
    {
        /* ASCII is encoded as itself */
        bytes[0] = (unsigned char) (unicode & 0xff);
        return 0;
    }

    /* lead byte pattern: num_of_bytes one-bits, then a zero, then payload */
    first_byte_ones = (unsigned char) ((0xffu << (8 - num_of_bytes)) & 0xffu);
    first_byte_bit_mask = (unsigned char) (0xffu >> (num_of_bytes + 1));

    /* first byte: the most significant payload bits */
    bytes[0] = (unsigned char) (((unicode >> (6 * (num_of_bytes - 1))) &
                                 first_byte_bit_mask) |
                                first_byte_ones);

    /* subsequent byte(s): 10xxxxxx, six payload bits each */
    for (i = 2; i <= num_of_bytes; i++)
    {
        bytes[i - 1] = (unsigned char) (((unicode >> ((num_of_bytes - i) * 6)) &
                                         0x3f) |
                                        0x80);
    }

    return 0;
}
71 | ||
72 | ||
/*
 * Convert a UTF-16 code unit pair to a Unicode code point.
 *
 * high:    the first code unit: a high surrogate (0xd800..0xdbff) for a
 *          supplementary character, or the character itself for a BMP
 *          character.
 * low:     the low surrogate (0xdc00..0xdfff) when high is a high surrogate.
 *          Pass the sentinel 0xffff to have high treated as a stand-alone
 *          BMP unit; U+FFFF is, by standard, a non-character.
 * unicode: receives the code point on success; left untouched on failure.
 *
 * Returns 0 on success, 1 on failure (NULL output pointer or a combination
 * of units that is neither a surrogate pair nor a BMP unit with sentinel).
 */
int utf16units_to_unicode(unsigned short high, unsigned short low, long* unicode)
{
    if (unicode == NULL)
    {
        return 1;
    }

    /* supplementary character */
    if (high >= 0xd800 && high <= 0xdbff &&
        low >= 0xdc00 && low <= 0xdfff)
    {
        /* computed in long: the product does not fit in a 16-bit int */
        *unicode = 0x10000L +
                   (((long) high - 0xd800L) * 0x400L +
                    ((long) low - 0xdc00L));
        return 0;
    }

    /* BMP character */
    if ((high < 0xd800 || high > 0xdfff) && low == 0xffff)
    {
        *unicode = (long) high;
        return 0;
    }

    /* invalid unit pair */
    return 1;
}
103 | ||
104 | ||
/*
 * Convert a little-endian UTF-16 byte stream to a UTF-8 byte stream.
 *
 * fh_utf16le: stream to read UTF-16LE bytes from, until end of file.
 * fh_utf8:    stream the UTF-8 bytes are written to.
 *
 * Every code unit is read as two bytes, low byte first. A unit that is not
 * a valid stand-alone BMP unit is combined with the following unit as a
 * surrogate pair. Conversion stops at the first error, after a diagnostic
 * has been printed to stderr; output produced up to that point has already
 * been written.
 *
 * Returns 0 on success, 1 on failure (NULL stream, malformed input, or an
 * I/O error).
 */
int utf16le_to_utf8(FILE* fh_utf16le, FILE* fh_utf8)
{
    int c1, c2, c3, c4;     /* int, so that EOF stays distinct from any byte */
    unsigned char d[4];     /* one encoded character, 1 to 4 bytes */
    size_t num_of_bytes;
    long unicode;
    unsigned short high, low;

    if (fh_utf16le == NULL || fh_utf8 == NULL)
    {
        return 1;
    }

    while ((c1 = fgetc(fh_utf16le)) != EOF)
    {
        c2 = fgetc(fh_utf16le);
        if (c2 == EOF)
        {
            /* odd number of bytes: half a code unit is left over */
            fprintf(stderr, "Malformed character founded. Conversion aborted.\n");
            return 1;
        }

        high = (unsigned short) (((unsigned int) c2 << 8) | (unsigned int) c1);

        /* 0xffff asks for high to be treated as a stand-alone BMP unit */
        if (utf16units_to_unicode(high, 0xffff, &unicode))
        {
            /* add low surrogate */
            c3 = fgetc(fh_utf16le);
            c4 = (c3 != EOF) ? fgetc(fh_utf16le) : EOF;
            if (c3 == EOF || c4 == EOF)
            {
                fprintf(stderr, "Lone surrogate unit found. Conversion aborted.\n");
                return 1;
            }

            low = (unsigned short) (((unsigned int) c4 << 8) | (unsigned int) c3);
            if (utf16units_to_unicode(high, low, &unicode))
            {
                fprintf(stderr, "Mismatched surrogate pair found. Conversion aborted.\n");
                return 1;
            }
        }

        if (unicode_to_utf8(unicode, d))
        {
            fprintf(stderr, "Invalid Unicode code point found. Conversion aborted.\n");
            return 1;
        }

        /*
         * The length comes from the code point, not from scanning d for a
         * zero byte: U+0000 encodes as a single 0x00 byte and must be kept.
         */
        if (unicode <= 0x7f)
        {
            num_of_bytes = 1;
        }
        else if (unicode <= 0x7ff)
        {
            num_of_bytes = 2;
        }
        else if (unicode <= 0xffff)
        {
            num_of_bytes = 3;
        }
        else
        {
            num_of_bytes = 4;
        }

        if (fwrite(d, sizeof(unsigned char), num_of_bytes, fh_utf8) != num_of_bytes)
        {
            fprintf(stderr, "Write error. Conversion aborted.\n");
            return 1;
        }
    }

    /* fgetc reports EOF for read errors too; tell the two apart */
    if (ferror(fh_utf16le))
    {
        fprintf(stderr, "Read error. Conversion aborted.\n");
        return 1;
    }

    return 0;
}
168 | ||
169 | ||
/*
 * Program entry point.
 *
 * Usage: <program> SOURCE DEST
 *
 * Reads SOURCE as little-endian UTF-16 and writes it to DEST as UTF-8.
 * Exactly two file names are expected; any further argument is reported as
 * unknown.
 *
 * Returns 0 on success, 1 on any failure (bad arguments, a file that cannot
 * be opened, a conversion error, or an error while closing DEST).
 */
int main(int argc, char** argv)
{
    FILE *fh_utf16le,
         *fh_utf8;
    char *source = NULL,
         *dest = NULL;
    int failure = 0,
        i;

    /* first argument is the source, second the destination */
    for (i = 1; i < argc; i++)
    {
        if (! source)
        {
            source = argv[i];
            continue;
        }
        if (! dest)
        {
            dest = argv[i];
            continue;
        }

        fprintf(stderr, "unknown parameter \"%s\"\n", argv[i]);
        failure = 1;
    }

    if (! failure && (! source || ! dest))
    {
        fprintf(stderr, "source and destination files must be specified\n");
        failure = 1;
    }

    /* do not touch any file when the command line is invalid */
    if (failure)
    {
        return failure;
    }

    /* binary mode: the data must not go through newline translation */
    fh_utf16le = fopen(source, "rb");
    if (fh_utf16le == NULL)
    {
        fprintf(stderr, "cannot open \"%s\" for reading\n", source);
        return 1;
    }

    fh_utf8 = fopen(dest, "wb");
    if (fh_utf8 == NULL)
    {
        fprintf(stderr, "cannot open \"%s\" for writing\n", dest);
        fclose(fh_utf16le);
        return 1;
    }

    failure = utf16le_to_utf8(fh_utf16le, fh_utf8);

    /* closing flushes buffered output, so a failure here means lost data */
    if (fclose(fh_utf8) != 0)
    {
        fprintf(stderr, "cannot finish writing \"%s\"\n", dest);
        failure = 1;
    }
    fclose(fh_utf16le);

    return failure;
}