Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- * A simple UTF-16 (little endian) to UTF-8 converter
- *
- * Released into the public domain by TEG
- */
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
/*
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * unicode: the code point to encode. It must lie in U+0000..U+10FFFF and
 *          must not be a UTF-16 surrogate (U+D800..U+DFFF).
 * bytes:   output buffer with room for at least 4 bytes. Only the bytes of
 *          the encoded sequence (1 to 4) are written; no terminator is
 *          appended and the remaining bytes are left untouched.
 *
 * Returns 0 on success, 1 if bytes is NULL or the code point cannot be
 * encoded (in which case nothing is written).
 */
int unicode_to_utf8(long unicode, unsigned char* bytes)
{
    int num_of_bytes;
    int i;
    unsigned char first_byte_ones;     /* first byte's high-order ones */
    unsigned char first_byte_bit_mask; /* payload bits of the first byte */

    if (bytes == NULL)
    {
        return 1;
    }

    /* surrogates exist only for UTF-16; encoding them yields ill-formed UTF-8 */
    if (unicode >= 0xd800 && unicode <= 0xdfff)
    {
        return 1;
    }

    if (unicode >= 0x00 && unicode <= 0x7f)
    {
        num_of_bytes = 1;
    }
    else if (unicode >= 0x80 && unicode <= 0x7ff)
    {
        num_of_bytes = 2;
    }
    else if (unicode >= 0x800 && unicode <= 0xffff)
    {
        num_of_bytes = 3;
    }
    else if (unicode >= 0x10000 && unicode <= 0x10ffff)
    {
        num_of_bytes = 4;
    }
    else
    {
        /* negative or beyond U+10FFFF */
        return 1;
    }

    if (num_of_bytes == 1)
    {
        /* ASCII is encoded as itself */
        bytes[0] = (unsigned char) (unicode & 0xff);
        return 0;
    }

    /* the lead byte starts with num_of_bytes one-bits followed by a zero bit */
    first_byte_ones = (unsigned char) ((0xff << (8 - num_of_bytes)) & 0xff);
    first_byte_bit_mask = (unsigned char) (0xff >> (num_of_bytes + 1));

    /* first byte: marker bits plus the most significant payload bits */
    bytes[0] = (unsigned char) (((unicode >> (6 * (num_of_bytes - 1))) &
                                 first_byte_bit_mask) |
                                first_byte_ones);

    /* subsequent byte(s): 10xxxxxx, six payload bits each, high bits first */
    for (i = 1; i < num_of_bytes; i++)
    {
        bytes[i] = (unsigned char) (((unicode >> (6 * (num_of_bytes - 1 - i))) &
                                     0x3f) |
                                    0x80);
    }

    return 0;
}
/*
 * Convert one or two UTF-16 code units to a Unicode code point.
 *
 * high:    the first (or only) code unit.
 * low:     the second code unit of a surrogate pair, or the sentinel 0xffff
 *          when the caller is asking about `high` on its own. U+FFFF is a
 *          non-character, which is why it can serve as the sentinel.
 * unicode: receives the decoded code point; left untouched on failure.
 *
 * Returns 0 on success, 1 if unicode is NULL or the units do not form a
 * valid pair / standalone BMP character.
 */
int utf16units_to_unicode(unsigned short high, unsigned short low, long* unicode)
{
    if (unicode == NULL)
    {
        return 1;
    }

    /* supplementary character: high surrogate followed by low surrogate */
    if ((high >= 0xd800 && high <= 0xdbff) &&
        (low >= 0xdc00 && low <= 0xdfff))
    {
        /*
         * Do the arithmetic in long: the intermediate product reaches
         * 0xffc00, which does not fit in an int on 16-bit-int platforms.
         */
        *unicode = 0x10000L +
                   (((long) high - 0xd800L) * 0x400L +
                    ((long) low - 0xdc00L));
        return 0;
    }

    /* BMP character: a non-surrogate unit with the "no second unit" sentinel */
    if ((high < 0xd800 || high > 0xdfff) && low == 0xffff)
    {
        *unicode = (long) high;
        return 0;
    }

    /* invalid unit pair */
    return 1;
}
/*
 * Convert a little-endian UTF-16 byte stream to a UTF-8 byte stream.
 *
 * fh_utf16le: input stream, read two bytes (one code unit) at a time.
 * fh_utf8:    output stream that receives the UTF-8 encoding.
 *
 * Conversion stops at the first problem (odd trailing byte, unpaired or
 * mismatched surrogate, unencodable code point, read or write error); a
 * diagnostic is printed to stderr and whatever was already written stays
 * in the output. Neither stream is closed.
 *
 * Returns 0 on success, 1 on failure or if either stream is NULL.
 */
int utf16le_to_utf8(FILE* fh_utf16le, FILE* fh_utf8)
{
    int failure = (fh_utf16le == NULL || fh_utf8 == NULL);
    int c1, c2, c3, c4;          /* fgetc() results: a byte value or EOF */
    unsigned char d[4];          /* one encoded UTF-8 sequence */
    size_t len;                  /* number of bytes used in d */
    long unicode = 0;
    unsigned short high, low;

    if (failure)
    {
        return failure;
    }

    while ((c1 = fgetc(fh_utf16le)) != EOF)
    {
        if ((c2 = fgetc(fh_utf16le)) == EOF)
        {
            /* odd number of bytes: half a code unit at the end */
            fprintf(stderr, "Malformed character founded. Conversion aborted.\n");
            failure = 1;
            break;
        }

        high = (unsigned short) ((c2 << 8) | c1);

        /* 0xffff as the second unit asks "is high a complete character?" */
        if (utf16units_to_unicode(high, 0xffff, &unicode))
        {
            /* high is a surrogate: fetch the unit that should complete it */
            if ((c3 = fgetc(fh_utf16le)) != EOF &&
                (c4 = fgetc(fh_utf16le)) != EOF)
            {
                low = (unsigned short) ((c4 << 8) | c3);
                if (utf16units_to_unicode(high, low, &unicode))
                {
                    fprintf(stderr, "Mismatched surrogate pair found. Conversion aborted.\n");
                    failure = 1;
                    break;
                }
            }
            else
            {
                fprintf(stderr, "Lone surrogate unit found. Conversion aborted.\n");
                failure = 1;
                break;
            }
        }

        if (unicode_to_utf8(unicode, d))
        {
            fprintf(stderr, "Invalid Unicode code point found. Conversion aborted.\n");
            failure = 1;
            break;
        }

        /*
         * Derive the sequence length from the code point rather than from
         * strlen(): U+0000 encodes as a single zero byte, which strlen()
         * would report as length 0 and silently drop from the output.
         */
        if (unicode < 0x80)
        {
            len = 1;
        }
        else if (unicode < 0x800)
        {
            len = 2;
        }
        else if (unicode < 0x10000)
        {
            len = 3;
        }
        else
        {
            len = 4;
        }

        if (fwrite(d, sizeof(unsigned char), len, fh_utf8) != len)
        {
            fprintf(stderr, "Write error. Conversion aborted.\n");
            failure = 1;
            break;
        }
    }

    /* fgetc() returns EOF for read errors too; do not report those as success */
    if (! failure && ferror(fh_utf16le))
    {
        fprintf(stderr, "Read error. Conversion aborted.\n");
        failure = 1;
    }

    return failure;
}
/*
 * Entry point: convert the UTF-16LE file named by the first argument into
 * the UTF-8 file named by the second argument.
 *
 * Returns 0 on success and 1 on any failure (bad arguments, a file that
 * cannot be opened, a conversion error, or a failure to flush the output).
 */
int main(int argc, char** argv)
{
    FILE *fh_utf16le = NULL;
    FILE *fh_utf8 = NULL;
    char *source = NULL;
    char *dest = NULL;
    int failure = 0;
    int i;

    /* first positional argument is the source, second the destination */
    for (i = 1; i < argc; i++)
    {
        if (! source)
        {
            source = argv[i];
            continue;
        }
        if (! dest)
        {
            dest = argv[i];
            continue;
        }
        fprintf(stderr, "unknown parameter \"%s\"\n", argv[i]);
        failure = 1;
    }

    if (! failure && (! source || ! dest))
    {
        fprintf(stderr, "source and destination files must be specified\n");
        failure = 1;
    }

    /* never reach fopen() with bad arguments: the paths may be NULL */
    if (failure)
    {
        return failure;
    }

    /* binary mode: the data must not go through newline/EOF translation */
    fh_utf16le = fopen(source, "rb");
    if (fh_utf16le == NULL)
    {
        fprintf(stderr, "cannot open \"%s\" for reading\n", source);
        return 1;
    }

    fh_utf8 = fopen(dest, "wb");
    if (fh_utf8 == NULL)
    {
        fprintf(stderr, "cannot open \"%s\" for writing\n", dest);
        fclose(fh_utf16le);
        return 1;
    }

    failure = utf16le_to_utf8(fh_utf16le, fh_utf8);

    /* fclose() flushes buffered output, so its failure means lost data */
    if (fclose(fh_utf8) != 0)
    {
        fprintf(stderr, "cannot finish writing \"%s\"\n", dest);
        failure = 1;
    }
    fclose(fh_utf16le);

    return failure;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement