Advertisement
Guest User

Untitled

a guest
Apr 6th, 2020
167
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 12.30 KB | None | 0 0
  #include <limits.h>
  #include <malloc.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  5.  
  /*
   * A byte buffer together with its length. The file-reading helper fills it
   * with raw file contents and the unicode_to_* encoders fill it with encoded
   * output. Field order matters: the struct is built with positional
   * initializers of the form {data, size}.
   */
  6. struct data_encoding {
  /* Heap buffer holding the bytes; main() releases it with free(). */
  7.     unsigned char *data;
  /* Number of bytes stored in `data`. */
  8.     int size;
  9. };
  10.  
  /*
   * Result of decoding a UTF byte stream: an array of code points, one per
   * element, plus the element count. Field order matters: the struct is built
   * with positional initializers of the form {data, size}.
   */
  11. struct data_utf_to_unicode {
  /* Heap array of decoded code points; main() releases it with free(). */
  12.     unsigned int *data;
  /* Number of code points stored in `data`. */
  13.     int size;
  14. };
  15.  
  /*
   * Append `value` at index *position of the growable array `a`.
   *
   * a        - heap array that currently has room for *size elements
   * size     - in/out: capacity of `a` in elements; updated when the array grows
   * position - in/out: index of the next free slot; incremented on success
   * value    - element to store
   *
   * Returns true on success. Returns false, leaving the array, *size and
   * *position untouched, when the arguments are inconsistent or the array
   * cannot be grown; the caller keeps ownership of the old block in that case.
   *
   * NOTE(review): `a` is received by value. When realloc() moves the block the
   * caller's own pointer is left dangling, and this signature has no way to
   * hand the new address back. Callers should reserve enough capacity up front
   * so the growth path is never taken, or the parameter should become
   * `unsigned int **a` together with every call site.
   */
  bool add(unsigned int *a, int *size, int *position, unsigned int value) {
      if (*size < 0 || *position < 0) {
          return false;
      }
      /* Grow while one spare slot is still left (position + 1 >= size). */
      if (*position >= *size - 1) {
          if (*size > (INT_MAX - 1) / 2) {
              return false; /* 2n + 1 would overflow int */
          }
          int grown = *size * 2 + 1;
          if (*position >= grown) {
              return false; /* the write index would still be out of range */
          }
          if ((size_t) grown > (size_t) -1 / sizeof *a) {
              return false; /* byte count would overflow size_t */
          }
          /* Request room for `grown` elements, not `grown` bytes. */
          unsigned int *tmp = realloc(a, (size_t) grown * sizeof *a);
          if (!tmp) {
              perror("realloc() ");
              return false;
          }
          a = tmp;
          *size = grown;
      }
      a[(*position)++] = value;
      return true;
  }
  32.  
  /*
   * Read the whole file `file_name` into a freshly allocated buffer.
   *
   * file_name - path of the file; it is opened in binary mode
   * position  - out: number of bytes stored in the returned buffer
   *             (always reset to 0 first, and left at 0 on failure)
   *
   * Returns a buffer the caller must free(). Returns NULL when the file cannot
   * be opened or read, when memory runs out, or when the file holds INT_MAX
   * bytes or more (the length has to fit in an int). An empty file yields a
   * valid non-NULL buffer with *position == 0.
   */
  unsigned char *read_bytes(const char *file_name, int *position) {
      enum { INITIAL_CAPACITY = 4096 };

      *position = 0;
      FILE *fin = fopen(file_name, "rb");
      if (fin == NULL) {
          perror("fopen() ");
          return NULL;
      }

      size_t capacity = INITIAL_CAPACITY;
      size_t length = 0;
      unsigned char *buf = malloc(capacity);
      if (buf == NULL) {
          perror("malloc() ");
          fclose(fin);
          return NULL;
      }

      for (;;) {
          /* Make room before reading so no byte is stored past the buffer. */
          if (length == capacity) {
              if (capacity >= (size_t) INT_MAX) {
                  fprintf(stderr, "read_bytes: %s is too large\n", file_name);
                  goto fail;
              }
              size_t grown = capacity > (size_t) INT_MAX / 2 ? (size_t) INT_MAX : capacity * 2;
              unsigned char *tmp = realloc(buf, grown);
              if (tmp == NULL) {
                  perror("realloc() ");
                  goto fail;
              }
              buf = tmp;
              capacity = grown;
          }
          size_t wanted = capacity - length;
          size_t got = fread(buf + length, 1, wanted, fin);
          length += got;
          if (got < wanted) {
              break; /* short read: end of file or a read error */
          }
      }
      if (ferror(fin)) {
          perror("fread() ");
          goto fail;
      }
      fclose(fin);
      *position = (int) length;
      return buf;

  fail:
      free(buf);
      fclose(fin);
      return NULL;
  }
  62.  
  63.  
  /*
   * Report whether the first `encode_arr_size` bytes of `data` equal the bytes
   * of `encode_arr`. Bytes are compared in order and the scan stops at the
   * first mismatch; a size of zero or less compares equal.
   */
  bool equal_arr(const unsigned char *data, const unsigned char *encode_arr, int encode_arr_size) {
      const unsigned char *lhs = data;
      const unsigned char *rhs = encode_arr;
      for (int left = encode_arr_size; left > 0; left--, lhs++, rhs++) {
          if (*lhs != *rhs) {
              return false;
          }
      }
      return true;
  }
  72.  
  73.  
  74. struct data_utf_to_unicode utf8_to_unicode(const unsigned char *utf_encoding, int size, bool bom) {
  75.     unsigned int *unicode_encoding = malloc(0);
  76.     int i = 3 * bom;
  77.     int data_size = 0;
  78.     int j = 0;
  79.     while (i < size) {
  80.         unsigned int res = 0;
  81.         unsigned int c = utf_encoding[i++];
  82.         if (!(c >> 7u)) {
  83.             res = c;
  84.         } else {
  85.             if ((c & 224u) == 192) {
  86.                 unsigned int b2 = (c & 31u) * (1u << 6u);
  87.                 unsigned int b1 = utf_encoding[i++] & 63u;
  88.                 res = b1 + b2;
  89.             } else if ((c & 240u) == 224) {
  90.                 unsigned int b3 = (c & 15u) * (1u << 12u);
  91.                 unsigned int b2 = (utf_encoding[i++] & 63u) * (1u << 6u);
  92.                 unsigned int b1 = utf_encoding[i++] & 63u;
  93.                 res = b1 + b2 + b3;
  94.             } else if ((c & 248u) == 240) {
  95.                 unsigned int b4 = (c & 7u) * (1u << 18u);
  96.                 unsigned int b3 = (utf_encoding[i++] & 63u) * (1u << 12u);
  97.                 unsigned int b2 = (utf_encoding[i++] & 63u) * (1u << 6u);
  98.                 unsigned int b1 = (utf_encoding[i++] & 63u);
  99.                 res = b1 + b2 + b3 + b4;
  100.             }
  101.         }
  102.         add(unicode_encoding, &data_size, &j, res);
  103.     }
  104.     struct data_utf_to_unicode tmp = {unicode_encoding, j};
  105.     return tmp;
  106. }
  107.  
  108. struct data_encoding unicode_to_utf8_bom(const unsigned int *data, int size) {
  109.     unsigned char *output_encoding = malloc(sizeof(unsigned char) * 4 * (size + 1));
  110.     int position = 0;
  111.     output_encoding[position++] = 0xEF;
  112.     output_encoding[position++] = 0xBB;
  113.     output_encoding[position++] = 0xBF;
  114.     int i = 0;
  115.     while (i < size) {
  116.         unsigned char res = 0;
  117.         if (data[i] < 1u << 7u) {
  118.             res = data[i];
  119.             output_encoding[position++] = res;
  120.         } else if (data[i] < 1u << 11u) {
  121.             unsigned char b1 = (data[i] & 63u) + 128;
  122.             unsigned char b2 = ((data[i] >> 6u) & 31u) + 192;
  123.             output_encoding[position++] = b2;
  124.             output_encoding[position++] = b1;
  125.             //printf("%X %X ", b2, b1);
  126.         } else if (data[i] < 1u << 16u) {
  127.             unsigned char b1 = (data[i] & 63u) + 128;
  128.             unsigned char b2 = ((data[i] >> 6u) & 63u) + 128;
  129.             unsigned char b3 = ((data[i] >> 12u) & 15u) + 224;
  130.             output_encoding[position++] = b3;
  131.             output_encoding[position++] = b2;
  132.             output_encoding[position++] = b1;
  133.         } else if (data[i] < 1u << 21u) {
  134.             unsigned char b1 = (data[i] & 63u) + 128;
  135.             unsigned char b2 = ((data[i] >> 6u) & 63u) + 128;
  136.             unsigned char b3 = ((data[i] >> 12u) & 63u) + 128;
  137.             unsigned char b4 = ((data[i] >> 18u) & 7u) + 240;
  138.             output_encoding[position++] = b4;
  139.             output_encoding[position++] = b3;
  140.             output_encoding[position++] = b2;
  141.             output_encoding[position++] = b1;
  142.         }
  143.         i++;
  144.     }
  145.     struct data_encoding tmp = {output_encoding, position};
  146.     return tmp;
  147. }
  148.  
  149. struct data_utf_to_unicode utf16_to_unicode(const unsigned char *utf_encoding, int size, bool is_be) {
  150.     unsigned int* unicode_encoding = malloc(0);
  151.     int i = 2;
  152.     int j = 0;
  153.     int data_size = 0;
  154.     while (i < size) {
  155.         unsigned int c = utf_encoding[i++];
  156.         unsigned int p = utf_encoding[i++];
  157.         unsigned int tmp = 0;
  158.         if (is_be) {
  159.             tmp = (c << 8u) + p;
  160.         } else {
  161.             tmp = (p << 8u) + c;
  162.         }
  163.         unsigned int res = tmp;
  164.         if (tmp >= 55296) {
  165.             c = utf_encoding[i++];
  166.             p = utf_encoding[i++];
  167.             unsigned int tmp2 = 0;
  168.             if (is_be) {
  169.                 tmp2 = (c << 8u) + p;
  170.             } else {
  171.                 tmp2 = (p << 8u) + c;
  172.             }
  173.             unsigned int b1 = ((tmp - 55296) << 10u) + 65536;
  174.             unsigned int b2 = tmp2 & ((1u << 10u) - 1u);
  175.             res = b1 + b2;
  176.         }
  177.         add(unicode_encoding, &data_size, &j,  res);
  178.     }
  179.     struct data_utf_to_unicode tmp = {unicode_encoding, j};
  180.     return tmp;
  181. }
  182.  
  /*
   * Store the low 16 bits of `res` as two bytes at output_encoding[*position]
   * and advance *position by two. With is_be the high byte is written first,
   * otherwise the low byte is written first.
   */
  void put_byte(unsigned char *output_encoding, int *position, unsigned int res, bool is_be) {
      const unsigned char high = (unsigned char) (res >> 8u);
      const unsigned char low = (unsigned char) (res & 0xFFu);
      unsigned char *slot = output_encoding + *position;

      slot[0] = is_be ? high : low;
      slot[1] = is_be ? low : high;
      *position += 2;
  }
  194.  
  195. struct data_encoding unicode_to_utf16(const unsigned int *data, int size, bool is_be) {
  196.     unsigned char *output_encoding = malloc(sizeof(unsigned char) * 4 * (size + 1));
  197.     int i = 0;
  198.     int position = 0;
  199.     put_byte(output_encoding, &position, 0xFEFF, is_be);
  200.     while (i < size) {
  201.         unsigned int res = 0;
  202.         if ((data[i] < (1u << 16u) && (data[i] < 55296 || data[i] > 57343))) {
  203.             res = data[i];
  204.             put_byte(output_encoding, &position, res, is_be);
  205.  
  206.         } else {
  207.             unsigned int b1 = (data[i] >> 10u) - 64 + 55296;
  208.             unsigned int b2 = (data[i] & ((1u << 10u) - 1)) + 56320;
  209.             put_byte(output_encoding, &position, b1, is_be);
  210.             put_byte(output_encoding, &position, b2, is_be);
  211.  
  212.         }
  213.         i++;
  214.     }
  215.     struct data_encoding tmp = {output_encoding, position};
  216.     return tmp;
  217. }
  218.  
  219. struct data_utf_to_unicode utf32_to_unicode(const unsigned char *utf_encoding, int size, bool is_be) {
  220.     unsigned int *unicode_encoding = malloc(0);
  221.     int i = 4;
  222.     int j = 0;
  223.     int data_size = 0;
  224.     while (i < size) {
  225.         unsigned int a = utf_encoding[i++];
  226.         unsigned int b = utf_encoding[i++];
  227.         unsigned int c = utf_encoding[i++];
  228.         unsigned int d = utf_encoding[i++];
  229.         if (is_be) {
  230.             add(unicode_encoding, &data_size, &j, ((a << 24u) + (b << 16u) + (c << 8u) + d));
  231.         } else {
  232.             add(unicode_encoding, &data_size, &j, (d << 24u) + (c << 16u) + (b << 8u) + a);
  233.         }
  234.     }
  235.     struct data_utf_to_unicode tmp = {unicode_encoding, j};
  236.     return tmp;
  237. }
  238.  
  239. struct data_encoding unicode_to_utf32(const unsigned int *data, int size, bool is_be) {
  240.     unsigned char *output_encoding = malloc(sizeof(unsigned char) * 4 * (size + 1));
  241.     int i = 0;
  242.     int position = 0;
  243.     if (is_be) {
  244.         output_encoding[position++] = 0x00;
  245.         output_encoding[position++] = 0x00;
  246.         output_encoding[position++] = 0xFE;
  247.         output_encoding[position++] = 0xFF;
  248.     } else {
  249.         output_encoding[position++] = 0xFF;
  250.         output_encoding[position++] = 0xFE;
  251.         output_encoding[position++] = 0x00;
  252.         output_encoding[position++] = 0x00;
  253.     }
  254.     while (i < size) {
  255.         unsigned int x = data[i++];
  256.         unsigned int a = (x >> 24u) & 255u;
  257.         unsigned int b = (x >> 16u) & 255u;
  258.         unsigned int c = (x >> 8u) & 255u;
  259.         unsigned int d = x & 255u;
  260.         if (is_be) {
  261.             output_encoding[position++] = a;
  262.             output_encoding[position++] = b;
  263.             output_encoding[position++] = c;
  264.             output_encoding[position++] = d;
  265.         } else {
  266.             output_encoding[position++] = d;
  267.             output_encoding[position++] = c;
  268.             output_encoding[position++] = b;
  269.             output_encoding[position++] = a;
  270.         }
  271.     }
  272.     struct data_encoding tmp = {output_encoding, position};
  273.     return tmp;
  274. }
  275.  
  276. struct data_encoding read_encoding(const char *fin) {
  277.     int position = 0;
  278.     unsigned char *data = read_bytes(fin, &position);
  279.     struct data_encoding tmp = {data, position};
  280.     return tmp;
  281. }
  282.  
  283.  
  284. struct data_utf_to_unicode utf_to_unicode(unsigned char *data, int size) {
  285.     const unsigned char utf8_bom[3] = {0xEF, 0xBB, 0xBF};
  286.     const unsigned char utf16_bom_be[2] = {0xFE, 0xFF};
  287.     const unsigned char utf16_bom_le[2] = {0xFF, 0xFE};
  288.     const unsigned char utf32_bom_be[4] = {0x00, 0x00, 0xFE, 0xFF};
  289.     const unsigned char utf32_bom_le[4] = {0xFF, 0xFE, 0x00, 0x00};
  290.     if (equal_arr(data, utf32_bom_be, 4))return utf32_to_unicode(data, size, true); // "utf32_bom_be";
  291.     if (equal_arr(data, utf32_bom_le, 4))return utf32_to_unicode(data, size, false); //"utf32_bom_le";
  292.     if (equal_arr(data, utf16_bom_be, 2))return utf16_to_unicode(data, size, true); // "utf16_bom_be";
  293.     if (equal_arr(data, utf16_bom_le, 2)) return utf16_to_unicode(data, size, false); // "utf16_bom_le"
  294.     if (equal_arr(data, utf8_bom, 3))return utf8_to_unicode(data, size, true); // "utf8_bom";
  295.     return utf8_to_unicode(data, size, false); // "utf8";
  296. }
  297.  
  298. struct data_encoding unicode_to_utf(const char *output_encoding, unsigned int *unicode_encoding, int size) {
  299.     if (strcmp(output_encoding, "utf8") == 0) {
  300.         return unicode_to_utf8_bom(unicode_encoding, size);
  301.     }
  302.     if (strcmp(output_encoding, "utf16_be") == 0) {
  303.         return unicode_to_utf16(unicode_encoding, size, true);
  304.     }
  305.     if (strcmp(output_encoding, "utf16_le") == 0) {
  306.         return unicode_to_utf16(unicode_encoding, size, false);
  307.     }
  308.     if (strcmp(output_encoding, "utf32_be") == 0) {
  309.         return unicode_to_utf32(unicode_encoding, size, true);
  310.     }
  311.     if (strcmp(output_encoding, "utf32_le") == 0) {
  312.         return unicode_to_utf32(unicode_encoding, size, false);
  313.     }
  314. }
  315.  
  /*
   * Write `size` bytes from `output_data` to the file `file_name`, creating or
   * truncating it (binary mode).
   *
   * Returns true only when every byte was written and the file was closed
   * without error. Returns false, with a message on stderr, when the arguments
   * are invalid (negative size, or NULL data with a positive size), the file
   * cannot be opened, the write is short, or the close fails. Invalid
   * arguments are rejected before the file is touched.
   */
  bool write_output_encoding(const char *file_name, unsigned char *output_data, int size) {
      if (size < 0 || (size > 0 && output_data == NULL)) {
          fprintf(stderr, "write_output_encoding: nothing valid to write\n");
          return false;
      }

      FILE *fout = fopen(file_name, "wb");
      if (fout == NULL) {
          perror("fopen() ");
          return false;
      }

      bool ok = true;
      size_t wanted = (size_t) size;
      if (wanted > 0 && fwrite(output_data, sizeof(unsigned char), wanted, fout) != wanted) {
          perror("fwrite() ");
          ok = false;
      }
      /* fclose() flushes buffered data, so its failure also means lost output. */
      if (fclose(fout) != 0) {
          if (ok) {
              perror("fclose() ");
          }
          ok = false;
      }
      return ok;
  }
  326.  
  327. int main(int argc, char **argv) {
  328.     const char *fin, *fout, *output_encoding;
  329.     fin = argv[1];
  330.     fout = argv[2];
  331.     output_encoding = argv[3];
  332.     // считывание кодировка
  333.     struct data_encoding first_data = read_encoding(fin);
  334.     // перекодировка в unicode
  335.     struct data_utf_to_unicode second_data = utf_to_unicode(first_data.data, first_data.size);
  336.     unsigned int *unicode_data = second_data.data;
  337.     int unicode_data_size = second_data.size;
  338.     // перекодировка в utf
  339.     struct data_encoding third_data = unicode_to_utf(output_encoding, unicode_data, unicode_data_size);
  340.     unsigned char *data = third_data.data;
  341.     // вывод кодировки
  342.     write_output_encoding(fout, third_data.data, third_data.size);
  343.     free(first_data.data);
  344.     free(second_data.data);
  345.     free(third_data.data);
  346.  
  347. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement