# Vectorized in-register deduplication

Aug 3rd, 2015
909
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. #include <stdio.h>
2. #include <stdint.h>
3. #include <assert.h>
4. #include <time.h>
5. #include <string.h>
6. #include <vector>
7. #include <algorithm>
8. #include <tmmintrin.h>  //SSSE3 is required
9. #ifdef __AVX__
10.     #include <immintrin.h>
11. #endif
12.
13. #ifdef _MSC_VER
14.     #define ALIGN(n) __declspec(align(n))
15. #else
16.     #define ALIGN(n) __attribute__((aligned(n)))
17. #endif
18.
19.
20. //============================ Solutions ==============================
21.
22. #define MIN(a, b) ((a) < (b) ? (a) : (b))
23. #define MAX(a, b) ((a) > (b) ? (a) : (b))
24.
25. static inline void deduplicate4_trivial(const uint32_t *arr, uint32_t *res) {
26.     res[0] = arr[0];
27.     res[1] = arr[1];
28.     res[2] = arr[2];
29.     res[3] = arr[3];
30.     //sorting network for 4 numbers, taken from https://en.wikipedia.org/wiki/Sorting_network
31.     #define CMP(i, j) {\
32.         uint32_t a = MIN(res[i], res[j]);\
33.         uint32_t b = MAX(res[i], res[j]);\
34.         res[i] = a;\
35.         res[j] = b;\
36.     }
37.     CMP(0, 2);
38.     CMP(1, 3);
39.     CMP(0, 1);
40.     CMP(2, 3);
41.     CMP(1, 2);
42.     //remove consecutive duplicates
43.     size_t pos = 1;
44.     if (res[0] != res[1])
45.         res[pos++] = res[1];
46.     if (res[1] != res[2])
47.         res[pos++] = res[2];
48.     if (res[2] != res[3])
49.         res[pos++] = res[3];
50.     //fill rest with zeros
51.     while (pos < 4)
52.         res[pos++] = 0;
53. }
54.
55. ALIGN(16) unsigned char lookup_bmi_chars[64][16] = {    //1Kb size
56.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F},
57.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
58.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
59.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
60.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
61.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
62.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
63.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
64.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
65.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
66.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
67.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
68.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
69.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
70.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
71.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
72.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
73.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
74.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
75.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
76.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
77.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
78.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
79.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
80.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
81.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
82.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
83.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
84.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
85.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
86.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
87.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
88.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
89.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
90.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
91.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
92.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
93.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
94.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
95.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
96.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
97.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
98.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
99.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
100.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
101.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
102.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
103.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
104.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
105.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
106.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
107.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
108.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
109.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
110.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
111.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
112.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
113.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
114.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
115.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
116.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
117.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
118.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
119.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
120. };
121. static __m128i *const lookup_bmi = (__m128i*) &lookup_bmi_chars[0][0];
122. #ifdef __AVX__
123. static inline __m128i deduplicate4_ssse3_bmi2(__m128i abcd) {
124.     __m128i bcda = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0, 3, 2, 1));
125.     __m128i cdab = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(1, 0, 3, 2));
129.     //Note: BMI2 instruction used here!
130.     unsigned int lutIndex = _pext_u32(maskFull, (1<<0) | (1<<4) | (1<<8) | (1<<12) | (1<<16) | (1<<20));
131.     __m128i shuf = lookup_bmi[lutIndex];
132.     return _mm_shuffle_epi8(abcd, shuf);
133. }
134. #endif
135.
136. static const int HASH_BITS = 6;
137. ALIGN(16) unsigned char lookup_hash_chars[1 << HASH_BITS][16] = {   //1Kb size
138.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F},
139.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
140.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
141.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
142.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
143.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
144.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
145.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
146.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
147.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
148.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
149.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
150.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
151.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
152.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
153.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
154.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
155.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
156.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
157.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
158.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
159.     {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
160.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
161.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
162.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
163.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
164.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
165.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
166.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
167.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
168.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
169.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
170.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
171.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
172.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
173.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
174.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
175.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
176.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
177.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
178.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
179.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
180.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
181.     {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
182.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
183.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
184.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
185.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
186.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
187.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
188.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
189.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
190.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
191.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
192.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
193.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
194.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
195.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
196.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
197.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
198.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
199.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
200.     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
201.     {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
202. };
203. static __m128i *const lookup_hash = (__m128i*) &lookup_hash_chars[0][0];
204.
205. static inline __m128i deduplicate4_ssse3(__m128i abcd) {
206.     __m128i bcda = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0, 3, 2, 1));
207.     __m128i cdab = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(1, 0, 3, 2));
211.     //Note: minimal perfect hash function here
212.     uint32_t lutIndex = (maskFull * 0X0044CCCEU) >> 26U;
213.     __m128i shuf = lookup_hash[lutIndex];
214.     return _mm_shuffle_epi8(abcd, shuf);
215. }
216.
217.
218. static __m128i *const lookup_direct = (__m128i*) &lookup_bmi_chars[0][0];
219. static __m128i *const lookup_direct_offset = lookup_direct - 0xC0U;
220.
221. static inline __m128i deduplicate4_ssse3_direct(__m128i abcd) {
222.     __m128i bcda = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0, 3, 2, 1));
223.     __m128i cdcd = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(3, 2, 3, 2));
227.     //Note: use index directly
229.     __m128i shuf = lookup_direct_offset[lutIndex];
230.     return _mm_shuffle_epi8(abcd, shuf);
231. }
232.
233. //======================== Perfect hash generator =========================
234.
235. typedef uint32_t hash_int;
236.
237. //searching for a hash function of this kind:
238. hash_int HashFunction(hash_int param, hash_int x, int bits) {
239.     return (param * x) >> hash_int(32 - bits);
240. }
241.
242. static const hash_int NO_PARAM = hash_int(-1);
243. hash_int GeneratePerfectHash(std::vector<hash_int> values, int bits, hash_int tries = NO_PARAM) {
244.     std::vector<int> table;
245.     for (hash_int param = 0; param < tries; param++) {
246.         table.assign(hash_int(1)<<bits, 0);
247.         bool ok = true;
248.         for (int i = 0; ok && i < values.size(); i++) {
249.             hash_int x = values[i];
250.             hash_int cell = HashFunction(param, x, bits);
251.             if (table[cell]++)
252.                 ok = false;
253.         }
254.         if (ok)
255.             return param;
256.     }
257.     return NO_PARAM;
258. }
259.
260. //======================== Lookup table generator =========================
261.
262. void PrintTable(const __m128i *lut, int size) {
263.     for (int i = 0; i < size; i++) {
264.         const unsigned char *entry = (unsigned char*) &lut[i];
265.         printf("\t{");
266.         for (int j = 0; j < 16; j++) {
267.             if (j) printf(", ");
268.             printf("0x%02X", (unsigned int)(entry[j]));
269.         }
270.         printf("},\n");
271.     }
272.     printf("\n");
273. }
274.
275. void GenerateLookupTable() {
277.         bool matr[4][4];
278.         matr[0][1] = (minMask >> 0) & 1;
279.         matr[1][2] = (minMask >> 1) & 1;
280.         matr[2][3] = (minMask >> 2) & 1;
281.         matr[0][3] = (minMask >> 3) & 1;
282.         matr[0][2] = (minMask >> 4) & 1;
283.         matr[1][3] = (minMask >> 5) & 1;
284.         int order[4] = {-1, -1, -1, -1};
285.         int pos = 0;
286.         for (int i = 0; i < 4; i++) {
287.             bool isNew = true;
288.             for (int j = 0; j < i; j++)
289.                 if (matr[j][i])
290.                     isNew = false;
291.             if (isNew)
292.                 order[pos++] = i;
293.         }
294.         unsigned char *bytes = lookup_bmi_chars[minMask];
295.         for (int i = 0; i < 4; i++)
296.             for (int j = 0; j < 4; j++)
297.                 bytes[4 * i + j] = (order[i] < 0 ? -1 : 4 * order[i] + j);
298.     }
299. }
300.
301. uint32_t GenerateLookupTableHash() {
302.     GenerateLookupTable();
306.         for (int i = 0; i < 6; i++)
307.             if ((minMask >> i) & 1) {
308.                 fullMask |= 0xFU << (4*i);
309.                 if (i >= 4) //comparisons 4 and 6, 5 and 7 are same
310.                     fullMask |= 0xFU << (4*(i+2));
311.             }
313.     }
314.     uint32_t param = GeneratePerfectHash(allMasks, HASH_BITS);
315.     assert(param != NO_PARAM);
316.     memset(lookup_hash, -1, sizeof(lookup_hash));
317.     for (int i = 0; i < allMasks.size(); i++) {
318.         uint32_t hash = HashFunction(param, allMasks[i], HASH_BITS);
319.         lookup_hash[hash] = lookup_bmi[i];
320.     }
321.     return param;
322. }
323.
324. void GenAll() {
325.     GenerateLookupTable();
326.     printf("lookup_bmi:\n");
327.     PrintTable(lookup_bmi, 64);
328.
329.     uint32_t param = GenerateLookupTableHash();
330.     printf("lookup_hash (%08X):\n", param);
331.     PrintTable(lookup_hash, 1 << HASH_BITS);
332. }
333.
334. //=============================== Testing =================================
335.
336. static const int CASES = 1<<14;
337. static const int TRIES = 1<<29;
338.
339. static union {
340.     uint32_t input_i[CASES][4];
341.     __m128i input_r[CASES];
342. };
343.
344. int main() {
345. /*  GenAll();
346.     return 0;*/
347.
348.     for (int i = 0; i < CASES; i++) {
349.         for (int j = 0; j < 4; j++)
350.             input_i[i][j] = rand() * 13457U + rand();
351.         int eq = rand() % 5;
352.         for (int c = 0; c < eq; c++) {
353.             int a = rand() % 4, b = rand() % 4;
354.             input_i[i][a] = input_i[i][b];
355.         }
356.     }
357.
358.     for (int i = 0; i < CASES; i++) {
359.         uint32_t res0[4], res1[4], res2[4], res3[4];
360.         uint32_t *ptr = input_i[i];
361.         deduplicate4_trivial(ptr, res0);
362.         _mm_storeu_si128((__m128i*)res1, deduplicate4_ssse3(input_r[i]));
363.         _mm_storeu_si128((__m128i*)res2, deduplicate4_ssse3_direct(input_r[i]));
364.
365.         memcpy(res3, res2, 16);
366. #ifdef __AVX__
367.         _mm_storeu_si128((__m128i*)res3, deduplicate4_ssse3_bmi2(input_r[i]));
368. #endif
369.         uint32_t srt0[4], srt1[4], srt2[4], srt3[4];
370.         memcpy(srt0, res0, 16);
371.         memcpy(srt1, res1, 16);
372.         memcpy(srt2, res2, 16);
373.         memcpy(srt3, res3, 16);
374.         std::sort(srt0, srt0 + 4);
375.         std::sort(srt1, srt1 + 4);
376.         std::sort(srt2, srt2 + 4);
377.         std::sort(srt3, srt3 + 4);
378.         if (memcmp(srt0, srt1, 16) || memcmp(srt0, srt2, 16) || memcmp(srt0, srt3, 16)) {
379.             printf("%d %d %d %d\n", ptr[0], ptr[1], ptr[2], ptr[3]);
380.             printf("%d %d %d %d\n", res0[0], res0[1], res0[2], res0[3]);
381.             printf("%d %d %d %d\n", res1[0], res1[1], res1[2], res1[3]);
382.             printf("%d %d %d %d\n", res2[0], res2[1], res2[2], res2[3]);
383.             printf("%d %d %d %d\n", res3[0], res3[1], res3[2], res3[3]);
384.         }
385.     }
386.
387. #ifdef __AVX__
388.     {
389.         __m128i sum = _mm_setzero_si128();
390.         int start = clock();
391.         for (int i = 0; i < TRIES; i++) {
392.             __m128i res = deduplicate4_ssse3_bmi2(input_r[i & (CASES-1)]);
394.         }
395.         uint32_t tmp[4];
396.         _mm_storeu_si128((__m128i*)tmp, sum);
397.         printf("Time = %0.3lf   (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, tmp[0] + tmp[1] + tmp[2] + tmp[3]);
398.     }
399. #endif
400.     {
401.         __m128i sum = _mm_setzero_si128();
402.         int start = clock();
403.         for (int i = 0; i < TRIES; i++) {
404.             __m128i res = deduplicate4_ssse3_direct(input_r[i & (CASES-1)]);
406.         }
407.         uint32_t tmp[4];
408.         _mm_storeu_si128((__m128i*)tmp, sum);
409.         printf("Time = %0.3lf   (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, tmp[0] + tmp[1] + tmp[2] + tmp[3]);
410.     }
411.     {
412.         __m128i sum = _mm_setzero_si128();
413.         int start = clock();
414.         for (int i = 0; i < TRIES; i++) {
415.             __m128i res = deduplicate4_ssse3(input_r[i & (CASES-1)]);
417.         }
418.         uint32_t tmp[4];
419.         _mm_storeu_si128((__m128i*)tmp, sum);
420.         printf("Time = %0.3lf   (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, tmp[0] + tmp[1] + tmp[2] + tmp[3]);
421.     }
422.     {
423.         uint32_t res[4];
424.         uint32_t sum = 0;
425.         int start = clock();
426.         for (int i = 0; i < TRIES; i++) {
427.             deduplicate4_trivial(input_i[i & (CASES-1)], res);
428.             sum += res[0] + res[1] + res[2] + res[3];
429.         }
430.         printf("Time = %0.3lf   (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
431.     }
432.
433.     return 0;
434. }