Advertisement
stgatilov

Vectorized in-register deduplication

Aug 3rd, 2015
874
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 21.78 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdint.h>
  3. #include <assert.h>
  4. #include <time.h>
  5. #include <string.h>
  6. #include <vector>
  7. #include <algorithm>
  8. #include <tmmintrin.h>  //SSSE3 is required
  9. #ifdef __AVX__
  10.     #include <immintrin.h>
  11. #endif
  12.  
  13. #ifdef _MSC_VER
  14.     #define ALIGN(n) __declspec(align(n))
  15. #else
  16.     #define ALIGN(n) __attribute__((aligned(n)))
  17. #endif
  18.  
  19.  
  20. //============================ Solutions ==============================
  21.  
  22. #define MIN(a, b) ((a) < (b) ? (a) : (b))
  23. #define MAX(a, b) ((a) > (b) ? (a) : (b))
  24.  
  25. static inline void deduplicate4_trivial(const uint32_t *arr, uint32_t *res) {
  26.     res[0] = arr[0];
  27.     res[1] = arr[1];
  28.     res[2] = arr[2];
  29.     res[3] = arr[3];
  30.     //sorting network for 4 numbers, taken from https://en.wikipedia.org/wiki/Sorting_network
  31.     #define CMP(i, j) {\
  32.         uint32_t a = MIN(res[i], res[j]);\
  33.         uint32_t b = MAX(res[i], res[j]);\
  34.         res[i] = a;\
  35.         res[j] = b;\
  36.     }
  37.     CMP(0, 2);
  38.     CMP(1, 3);
  39.     CMP(0, 1);
  40.     CMP(2, 3);
  41.     CMP(1, 2);
  42.     //remove consecutive duplicates
  43.     size_t pos = 1;
  44.     if (res[0] != res[1])
  45.         res[pos++] = res[1];
  46.     if (res[1] != res[2])
  47.         res[pos++] = res[2];
  48.     if (res[2] != res[3])
  49.         res[pos++] = res[3];
  50.     //fill rest with zeros
  51.     while (pos < 4)
  52.         res[pos++] = 0;
  53. }
  54.  
  55. ALIGN(16) unsigned char lookup_bmi_chars[64][16] = {    //1Kb size
  56.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F},
  57.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
  58.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
  59.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  60.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  61.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  62.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  63.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  64.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  65.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  66.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  67.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  68.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  69.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  70.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  71.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  72.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
  73.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  74.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
  75.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  76.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  77.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  78.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  79.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  80.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  81.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  82.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  83.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  84.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  85.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  86.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  87.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  88.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  89.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  90.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  91.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  92.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  93.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  94.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  95.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  96.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  97.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  98.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  99.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  100.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  101.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  102.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  103.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  104.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  105.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  106.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  107.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  108.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  109.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  110.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  111.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  112.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  113.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  114.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  115.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  116.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  117.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  118.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  119.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
  120. };
  121. static __m128i *const lookup_bmi = (__m128i*) &lookup_bmi_chars[0][0];
  122. #ifdef __AVX__
  123. static inline __m128i deduplicate4_ssse3_bmi2(__m128i abcd) {
  124.     __m128i bcda = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0, 3, 2, 1));
  125.     __m128i cdab = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(1, 0, 3, 2));
  126.     unsigned int mask1 = _mm_movemask_epi8(_mm_cmpeq_epi32(abcd, bcda));
  127.     unsigned int mask2 = _mm_movemask_epi8(_mm_cmpeq_epi32(abcd, cdab));
  128.     unsigned int maskFull = (mask2 << 16U) + mask1;
  129.     //Note: BMI2 instruction used here!
  130.     unsigned int lutIndex = _pext_u32(maskFull, (1<<0) | (1<<4) | (1<<8) | (1<<12) | (1<<16) | (1<<20));
  131.     __m128i shuf = lookup_bmi[lutIndex];
  132.     return _mm_shuffle_epi8(abcd, shuf);
  133. }
  134. #endif
  135.  
  136. static const int HASH_BITS = 6;
  137. ALIGN(16) unsigned char lookup_hash_chars[1 << HASH_BITS][16] = {   //1Kb size
  138.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F},
  139.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
  140.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  141.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  142.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
  143.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  144.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  145.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  146.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  147.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  148.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  149.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  150.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  151.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  152.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  153.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  154.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
  155.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  156.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  157.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  158.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
  159.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  160.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  161.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  162.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  163.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  164.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  165.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  166.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  167.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  168.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  169.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  170.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  171.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  172.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  173.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  174.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  175.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  176.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  177.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  178.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  179.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  180.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
  181.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  182.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  183.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  184.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  185.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  186.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  187.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  188.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  189.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  190.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  191.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  192.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  193.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  194.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  195.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  196.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  197.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  198.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  199.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  200.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
  201.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
  202. };
  203. static __m128i *const lookup_hash = (__m128i*) &lookup_hash_chars[0][0];
  204.  
  205. static inline __m128i deduplicate4_ssse3(__m128i abcd) {
  206.     __m128i bcda = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0, 3, 2, 1));
  207.     __m128i cdab = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(1, 0, 3, 2));
  208.     uint32_t mask1 = _mm_movemask_epi8(_mm_cmpeq_epi32(abcd, bcda));
  209.     uint32_t mask2 = _mm_movemask_epi8(_mm_cmpeq_epi32(abcd, cdab));
  210.     uint32_t maskFull = (mask2 << 16U) + mask1;
  211.     //Note: minimal perfect hash function here
  212.     uint32_t lutIndex = (maskFull * 0X0044CCCEU) >> 26U;
  213.     __m128i shuf = lookup_hash[lutIndex];
  214.     return _mm_shuffle_epi8(abcd, shuf);
  215. }
  216.  
  217.  
  218. static __m128i *const lookup_direct = (__m128i*) &lookup_bmi_chars[0][0];
  219. static __m128i *const lookup_direct_offset = lookup_direct - 0xC0U;
  220.  
  221. static inline __m128i deduplicate4_ssse3_direct(__m128i abcd) {
  222.     __m128i bcda = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0, 3, 2, 1));
  223.     __m128i cdcd = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(3, 2, 3, 2));
  224.     uint32_t mask1 = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(abcd, bcda)));
  225.     uint32_t mask2 = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(abcd, cdcd)));
  226.     uint32_t maskFull = 16U * mask2 + mask1;
  227.     //Note: use index directly
  228.     uint32_t lutIndex = maskFull;
  229.     __m128i shuf = lookup_direct_offset[lutIndex];
  230.     return _mm_shuffle_epi8(abcd, shuf);
  231. }
  232.  
  233. //======================== Perfect hash generator =========================
  234.  
  235. typedef uint32_t hash_int;
  236.  
  237. //searching for a hash function of this kind:
  238. hash_int HashFunction(hash_int param, hash_int x, int bits) {
  239.     return (param * x) >> hash_int(32 - bits);
  240. }
  241.  
  242. static const hash_int NO_PARAM = hash_int(-1);
  243. hash_int GeneratePerfectHash(std::vector<hash_int> values, int bits, hash_int tries = NO_PARAM) {
  244.     std::vector<int> table;
  245.     for (hash_int param = 0; param < tries; param++) {
  246.         table.assign(hash_int(1)<<bits, 0);
  247.         bool ok = true;
  248.         for (int i = 0; ok && i < values.size(); i++) {
  249.             hash_int x = values[i];
  250.             hash_int cell = HashFunction(param, x, bits);
  251.             if (table[cell]++)
  252.                 ok = false;
  253.         }
  254.         if (ok)
  255.             return param;
  256.     }
  257.     return NO_PARAM;
  258. }
  259.  
  260. //======================== Lookup table generator =========================
  261.  
  262. void PrintTable(const __m128i *lut, int size) {
  263.     for (int i = 0; i < size; i++) {
  264.         const unsigned char *entry = (unsigned char*) &lut[i];
  265.         printf("\t{");
  266.         for (int j = 0; j < 16; j++) {
  267.             if (j) printf(", ");
  268.             printf("0x%02X", (unsigned int)(entry[j]));
  269.         }
  270.         printf("},\n");
  271.     }
  272.     printf("\n");
  273. }
  274.  
  275. void GenerateLookupTable() {
  276.     for (int minMask = 0; minMask < 64; minMask++) {
  277.         bool matr[4][4];
  278.         matr[0][1] = (minMask >> 0) & 1;
  279.         matr[1][2] = (minMask >> 1) & 1;
  280.         matr[2][3] = (minMask >> 2) & 1;
  281.         matr[0][3] = (minMask >> 3) & 1;
  282.         matr[0][2] = (minMask >> 4) & 1;
  283.         matr[1][3] = (minMask >> 5) & 1;
  284.         int order[4] = {-1, -1, -1, -1};
  285.         int pos = 0;
  286.         for (int i = 0; i < 4; i++) {
  287.             bool isNew = true;
  288.             for (int j = 0; j < i; j++)
  289.                 if (matr[j][i])
  290.                     isNew = false;
  291.             if (isNew)
  292.                 order[pos++] = i;
  293.         }
  294.         unsigned char *bytes = lookup_bmi_chars[minMask];
  295.         for (int i = 0; i < 4; i++)
  296.             for (int j = 0; j < 4; j++)
  297.                 bytes[4 * i + j] = (order[i] < 0 ? -1 : 4 * order[i] + j);
  298.     }
  299. }
  300.  
  301. uint32_t GenerateLookupTableHash() {
  302.     GenerateLookupTable();
  303.     std::vector<uint32_t> allMasks;
  304.     for (int minMask = 0; minMask < 64; minMask++) {
  305.         uint32_t fullMask = 0;
  306.         for (int i = 0; i < 6; i++)
  307.             if ((minMask >> i) & 1) {
  308.                 fullMask |= 0xFU << (4*i);
  309.                 if (i >= 4) //comparisons 4 and 6, 5 and 7 are same
  310.                     fullMask |= 0xFU << (4*(i+2));
  311.             }
  312.         allMasks.push_back(fullMask);
  313.     }
  314.     uint32_t param = GeneratePerfectHash(allMasks, HASH_BITS);
  315.     assert(param != NO_PARAM);
  316.     memset(lookup_hash, -1, sizeof(lookup_hash));
  317.     for (int i = 0; i < allMasks.size(); i++) {
  318.         uint32_t hash = HashFunction(param, allMasks[i], HASH_BITS);
  319.         lookup_hash[hash] = lookup_bmi[i];
  320.     }
  321.     return param;
  322. }
  323.  
  324. void GenAll() {
  325.     GenerateLookupTable();
  326.     printf("lookup_bmi:\n");
  327.     PrintTable(lookup_bmi, 64);
  328.  
  329.     uint32_t param = GenerateLookupTableHash();
  330.     printf("lookup_hash (%08X):\n", param);
  331.     PrintTable(lookup_hash, 1 << HASH_BITS);
  332. }
  333.  
  334. //=============================== Testing =================================
  335.  
  336. static const int CASES = 1<<14;
  337. static const int TRIES = 1<<29;
  338.  
  339. static union {
  340.     uint32_t input_i[CASES][4];
  341.     __m128i input_r[CASES];
  342. };
  343.  
  344. int main() {
  345. /*  GenAll();
  346.     return 0;*/
  347.  
  348.     for (int i = 0; i < CASES; i++) {
  349.         for (int j = 0; j < 4; j++)
  350.             input_i[i][j] = rand() * 13457U + rand();
  351.         int eq = rand() % 5;
  352.         for (int c = 0; c < eq; c++) {
  353.             int a = rand() % 4, b = rand() % 4;
  354.             input_i[i][a] = input_i[i][b];
  355.         }
  356.     }
  357.  
  358.     for (int i = 0; i < CASES; i++) {
  359.         uint32_t res0[4], res1[4], res2[4], res3[4];
  360.         uint32_t *ptr = input_i[i];
  361.         deduplicate4_trivial(ptr, res0);
  362.         _mm_storeu_si128((__m128i*)res1, deduplicate4_ssse3(input_r[i]));
  363.         _mm_storeu_si128((__m128i*)res2, deduplicate4_ssse3_direct(input_r[i]));
  364.        
  365.         memcpy(res3, res2, 16);
  366. #ifdef __AVX__
  367.         _mm_storeu_si128((__m128i*)res3, deduplicate4_ssse3_bmi2(input_r[i]));
  368. #endif
  369.         uint32_t srt0[4], srt1[4], srt2[4], srt3[4];
  370.         memcpy(srt0, res0, 16);
  371.         memcpy(srt1, res1, 16);
  372.         memcpy(srt2, res2, 16);
  373.         memcpy(srt3, res3, 16);
  374.         std::sort(srt0, srt0 + 4);
  375.         std::sort(srt1, srt1 + 4);
  376.         std::sort(srt2, srt2 + 4);
  377.         std::sort(srt3, srt3 + 4);
  378.         if (memcmp(srt0, srt1, 16) || memcmp(srt0, srt2, 16) || memcmp(srt0, srt3, 16)) {
  379.             printf("%d %d %d %d\n", ptr[0], ptr[1], ptr[2], ptr[3]);
  380.             printf("%d %d %d %d\n", res0[0], res0[1], res0[2], res0[3]);
  381.             printf("%d %d %d %d\n", res1[0], res1[1], res1[2], res1[3]);
  382.             printf("%d %d %d %d\n", res2[0], res2[1], res2[2], res2[3]);
  383.             printf("%d %d %d %d\n", res3[0], res3[1], res3[2], res3[3]);
  384.         }
  385.     }
  386.  
  387. #ifdef __AVX__
  388.     {
  389.         __m128i sum = _mm_setzero_si128();
  390.         int start = clock();
  391.         for (int i = 0; i < TRIES; i++) {
  392.             __m128i res = deduplicate4_ssse3_bmi2(input_r[i & (CASES-1)]);
  393.             sum = _mm_add_epi32(sum, res);
  394.         }
  395.         uint32_t tmp[4];
  396.         _mm_storeu_si128((__m128i*)tmp, sum);
  397.         printf("Time = %0.3lf   (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, tmp[0] + tmp[1] + tmp[2] + tmp[3]);
  398.     }
  399. #endif
  400.     {
  401.         __m128i sum = _mm_setzero_si128();
  402.         int start = clock();
  403.         for (int i = 0; i < TRIES; i++) {
  404.             __m128i res = deduplicate4_ssse3_direct(input_r[i & (CASES-1)]);
  405.             sum = _mm_add_epi32(sum, res);
  406.         }
  407.         uint32_t tmp[4];
  408.         _mm_storeu_si128((__m128i*)tmp, sum);
  409.         printf("Time = %0.3lf   (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, tmp[0] + tmp[1] + tmp[2] + tmp[3]);
  410.     }
  411.     {
  412.         __m128i sum = _mm_setzero_si128();
  413.         int start = clock();
  414.         for (int i = 0; i < TRIES; i++) {
  415.             __m128i res = deduplicate4_ssse3(input_r[i & (CASES-1)]);
  416.             sum = _mm_add_epi32(sum, res);
  417.         }
  418.         uint32_t tmp[4];
  419.         _mm_storeu_si128((__m128i*)tmp, sum);
  420.         printf("Time = %0.3lf   (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, tmp[0] + tmp[1] + tmp[2] + tmp[3]);
  421.     }
  422.     {
  423.         uint32_t res[4];
  424.         uint32_t sum = 0;
  425.         int start = clock();
  426.         for (int i = 0; i < TRIES; i++) {
  427.             deduplicate4_trivial(input_i[i & (CASES-1)], res);
  428.             sum += res[0] + res[1] + res[2] + res[3];
  429.         }
  430.         printf("Time = %0.3lf   (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
  431.     }
  432.  
  433.     return 0;
  434. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement