Advertisement
Guest User

Untitled

a guest
Jul 26th, 2016
46
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.01 KB | None | 0 0
  /* Map a Unicode code point to its single-byte ISO-8859-1/ISO-8859-15 value.
   *
   * Code points below 256 are returned unchanged (the ISO-8859-1 identity
   * mapping). The eight characters that ISO-8859-15 adds in place of Latin-1
   * symbols (Euro sign, OE/oe ligatures, S/s and Z/z with caron, Y with
   * diaeresis) are mapped to their ISO-8859-15 byte values.
   *
   * This makes the mapper a hybrid of the two character sets: for example,
   * both U+00A4 and U+20AC yield 0xA4.
   *
   * Returns 0..255 for a representable code point, 256 otherwise.
   */
  static inline unsigned int to_latin9(const unsigned int code)
  {
      /* Latin-1 range: the byte value equals the code point. */
      if (code < 256U) {
          return code;
      }

      switch (code) {
      case 0x0152U: return 188U; /* U+0152 = 0xBC: OE ligature */
      case 0x0153U: return 189U; /* U+0153 = 0xBD: oe ligature */
      case 0x0160U: return 166U; /* U+0160 = 0xA6: S with caron */
      case 0x0161U: return 168U; /* U+0161 = 0xA8: s with caron */
      case 0x0178U: return 190U; /* U+0178 = 0xBE: Y with diaeresis */
      case 0x017DU: return 180U; /* U+017D = 0xB4: Z with caron */
      case 0x017EU: return 184U; /* U+017E = 0xB8: z with caron */
      case 0x20ACU: return 164U; /* U+20AC = 0xA4: Euro sign */
      default:      return 256U; /* not representable */
      }
  }

  /* Convert a UTF-8 string to ISO-8859-15, mapping each decoded code point
   * through to_latin9().
   *
   * output: destination buffer; must have room for (length + 1) bytes,
   *         including the trailing NUL byte.
   * input:  UTF-8 bytes to convert; does not need to be NUL-terminated.
   * length: number of bytes to read from input.
   *
   * Returns the number of bytes written to output, not counting the
   * terminating NUL byte.
   *
   * Code points that to_latin9() cannot represent and all invalid sequences
   * are dropped from the output:
   *  - stray continuation bytes (0x80..0xBF) and the bytes 0xFE and 0xFF;
   *  - a lead byte whose continuation bytes are malformed: only the bytes of
   *    the broken sequence are skipped, so the byte that interrupted it is
   *    still decoded normally;
   *  - overlong encodings (a code point stored in more bytes than it needs),
   *    so that e.g. 0xC0 0x80 cannot inject a NUL byte into the output;
   *  - a sequence cut off by the end of the input, which ends the conversion.
   *
   * Note: output == input is allowed (each output byte is written only after
   * at least one input byte has been consumed), but
   * input < output < input + length is not.
   */
  size_t utf8_to_latin9(char *const output, const char *const input, const size_t length)
  {
      unsigned char *out = (unsigned char *)output;
      const unsigned char *in = (const unsigned char *)input;
      const unsigned char *const end = in + length;

      while (in < end) {
          const unsigned int lead = *in;
          unsigned int code;   /* code point being assembled */
          unsigned int lowest; /* smallest code point that needs this length */
          size_t extra;        /* continuation bytes announced by the lead byte */
          size_t avail;        /* input bytes remaining after the lead byte */
          size_t got;          /* valid continuation bytes consumed so far */

          /* ASCII is copied through unchanged. */
          if (lead < 0x80U) {
              *out++ = *in++;
              continue;
          }

          /* 10xxxxxx without a lead byte, and 0xFE/0xFF, are invalid. */
          if (lead < 0xC0U || lead >= 0xFEU) {
              in++;
              continue;
          }

          if (lead < 0xE0U) {          /* 110xxxxx + 1 continuation byte */
              extra = 1;
              code = lead & 0x1FU;
              lowest = 0x80U;
          } else if (lead < 0xF0U) {   /* 1110xxxx + 2 continuation bytes */
              extra = 2;
              code = lead & 0x0FU;
              lowest = 0x800U;
          } else if (lead < 0xF8U) {   /* 11110xxx + 3 continuation bytes */
              extra = 3;
              code = lead & 0x07U;
              lowest = 0x10000U;
          } else if (lead < 0xFCU) {   /* 111110xx + 4 continuation bytes */
              extra = 4;
              code = lead & 0x03U;
              lowest = 0x200000U;
          } else {                     /* 1111110x + 5 continuation bytes */
              extra = 5;
              code = lead & 0x01U;
              lowest = 0x4000000U;
          }

          /* Accumulate 6 payload bits from each well-formed continuation byte. */
          avail = (size_t)(end - in) - 1U;
          got = 0;
          while (got < extra && got < avail && (in[got + 1] & 0xC0U) == 0x80U) {
              code = (code << 6) | (in[got + 1] & 0x3FU);
              got++;
          }

          if (got < extra) {
              if (got == avail) {
                  break;          /* input ended in the middle of a sequence */
              }
              /* Malformed: drop the lead byte and the continuation bytes seen
               * so far, then resume at the byte that broke the sequence. */
              in += got + 1;
              continue;
          }

          in += extra + 1;

          /* Reject overlong forms instead of decoding them. */
          if (code < lowest) {
              continue;
          }

          code = to_latin9(code);
          if (code < 256U) {
              *out++ = (unsigned char)code;
          }
      }

      /* Terminate the output string. */
      *out = '\0';

      return (size_t)(out - (unsigned char *)output);
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement