Guest User

Untitled

a guest
Aug 15th, 2018
87
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.00 KB | None | 0 0
  #include "utf8.h"

  #include <limits.h>
  2.  
  /*
   * Forward declarations of the per-encoding converters referenced by the
   * xml_encodings table. The encode functions take one source byte and
   * return a 16-bit value; the decode functions take a 16-bit value and
   * return one byte.
   */
  static inline unsigned short xml_encode_iso_8859_1(unsigned char c);
  static inline char xml_decode_iso_8859_1(unsigned short c);
  static inline unsigned short xml_encode_us_ascii(unsigned char c);
  static inline char xml_decode_us_ascii(unsigned short c);
  7.  
  /*
   * malloc() wrapper that never returns NULL.
   *
   * size: number of bytes requested.
   *
   * Returns the pointer handed back by malloc(). If malloc() returns NULL,
   * "Out of memory!" is written to stderr and the process exits with
   * status 1.
   */
  static void *emalloc(size_t size)
  {
      void *mem = malloc(size);

      if (mem != NULL) {
          return mem;
      }

      fprintf(stderr, "Out of memory!\n");
      exit(1);
  }
  18.  
  /*
   * realloc() wrapper that never returns NULL.
   *
   * ptr:  block to resize, passed straight to realloc().
   * size: new size in bytes.
   *
   * Returns the pointer handed back by realloc(). If realloc() returns
   * NULL, "Out of memory!" is written to stderr and the process exits with
   * status 1.
   */
  static void *erealloc(void *ptr, size_t size)
  {
      void *resized = realloc(ptr, size);

      if (resized != NULL) {
          return resized;
      }

      fprintf(stderr, "Out of memory!\n");
      exit(1);
  }
  29.  
  30. /* All the encoding functions are set to NULL right now, since all
  31. * the encoding is currently done internally by expat/xmltok.
  32. */
  33. xml_encoding xml_encodings[] = {
  34. { "ISO-8859-1", xml_decode_iso_8859_1, xml_encode_iso_8859_1 },
  35. { "US-ASCII", xml_decode_us_ascii, xml_encode_us_ascii },
  36. { "UTF-8", NULL, NULL },
  37. { NULL, NULL, NULL }
  38. };
  39.  
  /*
   * Encoder for ISO-8859-1 input: returns the byte value widened to 16 bits,
   * otherwise unchanged.
   */
  static inline unsigned short xml_encode_iso_8859_1(unsigned char c)
  {
      unsigned short widened = c;

      return widened;
  }
  44.  
  /*
   * Decoder for ISO-8859-1 output: values up to 0xff are returned as a single
   * byte; anything larger is replaced with '?'.
   */
  static inline char xml_decode_iso_8859_1(unsigned short c)
  {
      if (c > 0xff) {
          return '?';
      }
      return (char)c;
  }
  49.  
  /*
   * Encoder for US-ASCII input: returns the byte value widened to 16 bits.
   * No range check is applied, so bytes above 0x7f are passed through as-is.
   */
  static inline unsigned short xml_encode_us_ascii(unsigned char c)
  {
      unsigned short widened = c;

      return widened;
  }
  54.  
  /*
   * Decoder for US-ASCII output: values up to 0x7f are returned as a single
   * byte; anything larger is replaced with '?'.
   */
  static inline char xml_decode_us_ascii(unsigned short c)
  {
      if (c > 0x7f) {
          return '?';
      }
      return (char)c;
  }
  59.  
  60. static xml_encoding *xml_get_encoding(const XML_Char *name)
  61. {
  62. xml_encoding *enc = &xml_encodings[0];
  63.  
  64. while (enc && enc->name) {
  65. if (strcasecmp(name, enc->name) == 0)
  66. return enc;
  67. enc++;
  68. }
  69. return NULL;
  70. }
  71.  
  72. static char *xml_utf8_encode(const char *s, int len, int *newlen,
  73. const XML_Char *encoding)
  74. {
  75. int pos = len;
  76. char *newbuf;
  77. unsigned int c;
  78. unsigned short (*encoder)(unsigned char) = NULL;
  79. xml_encoding *enc = xml_get_encoding(encoding);
  80.  
  81. *newlen = 0;
  82. if (enc)
  83. encoder = enc->encoding_function;
  84. else
  85. /* If the target encoding was unknown, fail */
  86. return NULL;
  87.  
  88. if (encoder == NULL) {
  89. /* If no encoder function was specified, return the data as-is.
  90. */
  91. newbuf = (char*)emalloc(len + 1);
  92. memcpy(newbuf, s, len);
  93. *newlen = len;
  94. newbuf[*newlen] = '\0';
  95. return newbuf;
  96. }
  97.  
  98. /* This is the theoretical max (will never get beyond len * 2 as long
  99. * as we are converting from single-byte characters, though) */
  100. newbuf = emalloc(len);
  101. while (pos > 0) {
  102. c = encoder ? encoder((unsigned char)(*s)) : (unsigned short)(*s);
  103.  
  104. if (c < 0x80)
  105. newbuf[(*newlen)++] = (char) c;
  106. else if (c < 0x800) {
  107. newbuf[(*newlen)++] = (0xc0 | (c >> 6));
  108. newbuf[(*newlen)++] = (0x80 | (c & 0x3f));
  109. }
  110. else if (c < 0x10000) {
  111. newbuf[(*newlen)++] = (0xe0 | (c >> 12));
  112. newbuf[(*newlen)++] = (0xc0 | ((c >> 6) & 0x3f));
  113. newbuf[(*newlen)++] = (0x80 | (c & 0x3f));
  114. }
  115. else if (c < 0x200000) {
  116. newbuf[(*newlen)++] = (0xf0 | (c >> 18));
  117. newbuf[(*newlen)++] = (0xe0 | ((c >> 12) & 0x3f));
  118. newbuf[(*newlen)++] = (0xc0 | ((c >> 6) & 0x3f));
  119. newbuf[(*newlen)++] = (0x80 | (c & 0x3f));
  120. }
  121. pos--;
  122. s++;
  123. }
  124.  
  125. newbuf[*newlen] = 0;
  126. newbuf = erealloc(newbuf, (*newlen)+1);
  127. return newbuf;
  128. }
  129.  
  130. static char *xml_utf8_decode(const XML_Char *s, int len, int *newlen,
  131. const XML_Char *encoding)
  132. {
  133. int pos = len;
  134. char *newbuf = emalloc(len + 1);
  135. unsigned short c;
  136. char (*decoder)(unsigned short) = NULL;
  137. xml_encoding *enc = xml_get_encoding(encoding);
  138.  
  139. *newlen = 0;
  140. if (enc)
  141. decoder = enc->decoding_function;
  142.  
  143. if (decoder == NULL) {
  144. /* If the target encoding was unknown, or no decoder function
  145. * was specified, return the UTF-8-encoded data as-is.
  146. */
  147. memcpy(newbuf, s, len);
  148. *newlen = len;
  149. newbuf[*newlen] = '\0';
  150. return newbuf;
  151. }
  152.  
  153. while (pos > 0) {
  154. c = (unsigned char)(*s);
  155. if (c >= 0xf0) { /* four bytes encoded, 21 bits */
  156. if(pos-4 >= 0)
  157. c = ((s[0]&7)<<18) | ((s[1]&63)<<12) | ((s[2]&63)<<6) | (s[3]&63);
  158. else
  159. c = '?';
  160.  
  161. s += 4;
  162. pos -= 4;
  163. }
  164. else if (c >= 0xe0) { /* three bytes encoded, 16 bits */
  165. if(pos-3 >= 0)
  166. c = ((s[0]&63)<<12) | ((s[1]&63)<<6) | (s[2]&63);
  167. else
  168. c = '?';
  169.  
  170. s += 3;
  171. pos -= 3;
  172. }
  173. else if (c >= 0xc0) { /* two bytes encoded, 11 bits */
  174. if(pos-2 >= 0)
  175. c = ((s[0]&63)<<6) | (s[1]&63);
  176. else
  177. c = '?';
  178.  
  179. s += 2;
  180. pos -= 2;
  181. }
  182. else {
  183. s++;
  184. pos--;
  185. }
  186.  
  187. newbuf[*newlen] = decoder ? decoder(c) : c;
  188. ++*newlen;
  189. }
  190.  
  191. if (*newlen < len)
  192. newbuf = erealloc(newbuf, *newlen + 1);
  193.  
  194. newbuf[*newlen] = '\0';
  195. return newbuf;
  196. }
  197.  
  198. /* Public function */
  199.  
  /*
   * Convert a NUL-terminated ISO-8859-1 string to UTF-8.
   *
   * str: input string; may be NULL.
   *
   * Returns a newly allocated, NUL-terminated string owned by the caller.
   * An empty input yields an allocated empty string. Returns NULL when
   * `str` is NULL or its length does not fit in an int.
   */
  char *utf8_encode(const char *str)
  {
      size_t in_len;
      int out_len = 0;
      char *out;

      if (str == NULL) {
          return NULL;
      }

      in_len = strlen(str);
      if (in_len == 0) {
          /* Produce a defined result for the empty string instead of
           * returning an uninitialised pointer. */
          out = emalloc(1);
          out[0] = '\0';
          return out;
      }
      if (in_len > (size_t)INT_MAX) {
          return NULL;
      }

      return xml_utf8_encode(str, (int)in_len, &out_len, "ISO-8859-1");
  }
  211.  
  /*
   * Convert a NUL-terminated UTF-8 string to ISO-8859-1.
   *
   * str: input string; may be NULL.
   *
   * Returns a newly allocated, NUL-terminated string owned by the caller.
   * An empty input yields an allocated empty string. Returns NULL when
   * `str` is NULL or its length does not fit in an int.
   */
  char *utf8_decode(const char *str)
  {
      size_t in_len;
      int out_len = 0;
      char *out;

      if (str == NULL) {
          return NULL;
      }

      in_len = strlen(str);
      if (in_len == 0) {
          /* Produce a defined result for the empty string instead of
           * returning an uninitialised pointer. */
          out = emalloc(1);
          out[0] = '\0';
          return out;
      }
      if (in_len > (size_t)INT_MAX) {
          return NULL;
      }

      return xml_utf8_decode(str, (int)in_len, &out_len, "ISO-8859-1");
  }
Add Comment
Please, Sign In to add comment