Guest User

Vectorized unsafe IP address parsing v3

a guest
Jul 30th, 2015
318
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #include <stdio.h>
  2. #include <time.h>
  3. #include <vector>
  4. #include <string>
  5. #include <algorithm>
  6. #include <smmintrin.h>
  7. #include <stdint.h>
  8. #include <string.h>
  9. #ifdef _MSC_VER
  10.     #define ALIGN(n) __declspec(align(n))
  11. #else
  12.     #define ALIGN(n) __attribute__((aligned(n)))
  13. #endif
  14.  
  15. typedef unsigned int UINT32;
  16.  
  17. //=============================================== original =================================================
  18.  
  19. inline UINT32 GetIP(const char *p)
  20. {
  21.     UINT32 dwIP=0,dwIP_Part=0;
  22.     while(true)
  23.     {
  24.         if(p[0] == 0)
  25.         {
  26.             dwIP = (dwIP << 8) | dwIP_Part;
  27.             break;
  28.         }
  29.         if(p[0]=='.')
  30.         {      
  31.             dwIP = (dwIP << 8) | dwIP_Part;                    
  32.             dwIP_Part = 0;
  33.            p++;
  34.         }
  35.         dwIP_Part = (dwIP_Part*10)+(p[0]-'0');
  36.         p++;
  37.     }
  38.     return dwIP;
  39. }
  40.  
  41. //=============================================== stgatilov =================================================
  42.  
  43. __m128i shuffleTable[65536];    //can be reduced 256x times, see solution by @IwillnotexistIdonotexist below
  44. void MyInit() {
  45.     memset(shuffleTable, -1, sizeof(shuffleTable));
  46.     int len[4];
  47.     for (len[0] = 1; len[0] <= 3; len[0]++)
  48.         for (len[1] = 1; len[1] <= 3; len[1]++)
  49.             for (len[2] = 1; len[2] <= 3; len[2]++)
  50.                 for (len[3] = 1; len[3] <= 3; len[3]++) {
  51.                     int slen = len[0] + len[1] + len[2] + len[3] + 4;
  52.                     int rem = 16 - slen;
  53.                     for (int rmask = 0; rmask < 1<<rem; rmask++) {
  54. //                    { int rmask = (1<<rem)-1;    //note: only maximal rmask is possible if strings are zero-padded
  55.                         int mask = 0;
  56.                         char shuf[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
  57.                         int pos = 0;
  58.                         for (int i = 0; i < 4; i++) {
  59.                             for (int j = 0; j < len[i]; j++) {
  60.                                 shuf[(3-i) * 4 + (len[i]-1-j)] = pos;
  61.                                 pos++;
  62.                             }
  63.                             mask ^= (1<<pos);
  64.                             pos++;
  65.                         }
  66.                         mask ^= (rmask<<slen);
  67.                         _mm_store_si128(&shuffleTable[mask], _mm_loadu_si128((__m128i*)shuf));
  68.                     }
  69.                 }
  70. }
  71. inline UINT32 MyGetIP(const char *str) {
  72.     __m128i input = _mm_lddqu_si128((const __m128i*)str);
  73. //  for (int i = 0; i < 16; i++) printf("%c ", input.m128i_u8[i]); printf("\n");
  74.     input = _mm_sub_epi8(input, _mm_set1_epi8('0'));
  75. //  for (int i = 0; i < 16; i++) printf("%d ", int(input.m128i_u8[i])); printf("\n");
  76.     __m128i cmp = input;
  77. //  for (int i = 0; i < 16; i++) printf("%c", cmp.m128i_i8[i] < 0 ? 'X' : '.'); printf("\n");
  78.     uint64_t mask = _mm_movemask_epi8(cmp);
  79. //  printf("%d\n", mask);
  80.     __m128i shuf = shuffleTable[mask];
  81. //  for (int i = 0; i < 16; i++) printf("%d ", int(shuf.m128i_i8[i])); printf("\n");
  82.     __m128i arr = _mm_shuffle_epi8(input, shuf);
  83. //  for (int i = 0; i < 16; i++) printf("%d ", int(arr.m128i_u8[i])); printf("\n");
  84.     __m128i coeffs = _mm_set_epi8(0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1);
  85.     __m128i prod = _mm_maddubs_epi16(coeffs, arr);
  86. //  for (int i = 0; i < 8; i++) printf("%d ", int(prod.m128i_u16[i])); printf("\n");
  87.     prod = _mm_hadd_epi16(prod, prod);
  88. //  for (int i = 0; i < 4; i++) printf("%d ", int(prod.m128i_u16[i])); printf("\n");
  89.     prod = _mm_shuffle_epi8(prod, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 4, 2, 0));
  90. //  for (int i = 0; i < 4; i++) printf("%d ", int(prod.m128i_u8[i])); printf("\n");
  91.     return _mm_extract_epi32(prod, 0);
  92. //  return (UINT32(_mm_extract_epi16(prod, 1)) << 16) + UINT32(_mm_extract_epi16(prod, 0));
  93. }
  94.  
  95. //==================================== @IwillnotexistIdonotexist ===============================================
  96. //lookup table is compressed to 4Kb
  97. //taken from http://pastebin.com/XJ7w9nq1/* DEEP MAGIC BEGINS HERE. */
  98.     return (unsigned)(v*0x008d981a)>>24;
  99. }
  100.  
  101. inline UINT32 MyGetIP_comp(const char *str) {
  102.   __m128i input = _mm_lddqu_si128((const __m128i*)str);   //"192.167.1.3"
  103.   uint64_t zm = _mm_movemask_epi8(_mm_cmpeq_epi8(input, _mm_setzero_si128()));
  104.   zm ^= (zm-1);                                           //(1<<12) - 1
  105.   input = _mm_sub_epi8(input, _mm_set1_epi8('0'));        //1 9 2 254 1 6 7 254 1 254 3 208 245 0 8 40
  106.   __m128i cmp = input;                                    //...X...X.X.XX...  (signs)
  107.   uint64_t mask = _mm_movemask_epi8(cmp);                 //6792 - magic index
  108.   mask &= zm;
  109.   uint64_t hashmask = perfecthash(mask);
  110.   __m128i shuf = ((const __m128i*)TBL)[hashmask];         //10 -1 -1 -1 8 -1 -1 -1 6 5 4 -1 2 1 0 -1
  111.   __m128i arr = _mm_shuffle_epi8(input, shuf);            //3 0 0 0 | 1 0 0 0 | 7 6 1 0 | 2 9 1 0
  112.   __m128i coeffs = _mm_set_epi8(0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1);
  113.   __m128i prod = _mm_maddubs_epi16(coeffs, arr);          //3 0 | 1 0 | 67 100 | 92 100
  114.   prod = _mm_hadd_epi16(prod, prod);                      //3 | 1 | 167 | 192 | ? | ? | ? | ?
  115.   __m128i imm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 4, 2, 0);
  116.   prod = _mm_shuffle_epi8(prod, imm);                     //3 1 167 192 0 0 0 0 0 0 0 0 0 0 0 0
  117.   return _mm_extract_epi32(prod, 0);
  118. }
  119.  
  120. //========================================== TESTING =================================================
  121.  
  122. static const int CNT = 16<<10;
  123.  
  124. int main() {
  125.   MyInit();
  126.   printf("%08X\n", MyGetIP("192.167.1.3"));
  127.  
  128.   std::vector<std::string> samples;
  129.   for (int i = 0; i < CNT; i++) {
  130.     std::string str;
  131.     for (int j = 0; j < 4; j++) {
  132.       if (j) str += '.';
  133.       int x = rand() % 256;
  134.       char qq[16];
  135.       sprintf(qq, "%d", x);
  136.       str += qq;
  137.     }
  138.     samples.push_back(str);
  139.  
  140. //    printf("%s\n", samples[i].c_str());
  141.     auto a = GetIP(samples.back().c_str());
  142.     auto b = MyGetIP(samples.back().c_str());
  143.   auto c = MyGetIP_comp(samples.back().c_str());
  144.     if (a != b || a != c || b != c)
  145.       printf("%s: %08X vs %08X vs %08X\n", samples[i].c_str(), a, b, c);
  146.        
  147.   }
  148.  
  149.  
  150. {
  151.   int start = clock();
  152.   int sum = 0;
  153.   for (int i = 0; i < 1<<27; i++) {
  154.       const char *input = samples[(i+0) & (CNT-1)].c_str();
  155.       sum += MyGetIP(input);
  156.   }
  157.   int elapsed = clock() - start;
  158.   printf("Time = %0.3lf   (%d)\n", double(elapsed) / CLOCKS_PER_SEC, sum);
  159. }
  160. {
  161.   int start = clock();
  162.   int sum = 0;
  163.   for (int i = 0; i < 1<<27; i++) {
  164.       const char *input = samples[(i+0) & (CNT-1)].c_str();
  165.       sum += MyGetIP_comp(input);
  166.   }
  167.   int elapsed = clock() - start;
  168.   printf("Time = %0.3lf   (%d)\n", double(elapsed) / CLOCKS_PER_SEC, sum);
  169. }
  170. {
  171.   int start = clock();
  172.   int sum = 0;
  173.   for (int i = 0; i < 1<<27; i++) {
  174.       const char *input = samples[i & (CNT-1)].c_str();
  175.       sum += GetIP(input);
  176.   }
  177.   int elapsed = clock() - start;
  178.   printf("Time = %0.3lf   (%d)\n", double(elapsed) / CLOCKS_PER_SEC, sum);
  179. }
  180.  
  181.   return 0;
  182. }
RAW Paste Data