Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdint.h>
- #include <assert.h>
- #include <time.h>
- #include <string.h>
- #include <vector>
- #include <algorithm>
- #include <tmmintrin.h> //SSSE3 is required
- #ifdef __AVX__
- #include <immintrin.h>
- #endif
- #ifdef _MSC_VER
- #define ALIGN(n) __declspec(align(n))
- #else
- #define ALIGN(n) __attribute__((aligned(n)))
- #endif
- //============================ Solutions ==============================
- #define MIN(a, b) ((a) < (b) ? (a) : (b))
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
- static inline void deduplicate4_trivial(const uint32_t *arr, uint32_t *res) {
- res[0] = arr[0];
- res[1] = arr[1];
- res[2] = arr[2];
- res[3] = arr[3];
- //sorting network for 4 numbers, taken from https://en.wikipedia.org/wiki/Sorting_network
- #define CMP(i, j) {\
- uint32_t a = MIN(res[i], res[j]);\
- uint32_t b = MAX(res[i], res[j]);\
- res[i] = a;\
- res[j] = b;\
- }
- CMP(0, 2);
- CMP(1, 3);
- CMP(0, 1);
- CMP(2, 3);
- CMP(1, 2);
- //remove consecutive duplicates
- size_t pos = 1;
- if (res[0] != res[1])
- res[pos++] = res[1];
- if (res[1] != res[2])
- res[pos++] = res[2];
- if (res[2] != res[3])
- res[pos++] = res[3];
- //fill rest with zeros
- while (pos < 4)
- res[pos++] = 0;
- }
- ALIGN(16) unsigned char lookup_bmi_chars[64][16] = { //1Kb size
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
- };
- static __m128i *const lookup_bmi = (__m128i*) &lookup_bmi_chars[0][0];
- #ifdef __AVX__
- static inline __m128i deduplicate4_ssse3_bmi2(__m128i abcd) {
- __m128i bcda = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0, 3, 2, 1));
- __m128i cdab = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(1, 0, 3, 2));
- unsigned int mask1 = _mm_movemask_epi8(_mm_cmpeq_epi32(abcd, bcda));
- unsigned int mask2 = _mm_movemask_epi8(_mm_cmpeq_epi32(abcd, cdab));
- unsigned int maskFull = (mask2 << 16U) + mask1;
- //Note: BMI2 instruction used here!
- unsigned int lutIndex = _pext_u32(maskFull, (1<<0) | (1<<4) | (1<<8) | (1<<12) | (1<<16) | (1<<20));
- __m128i shuf = lookup_bmi[lutIndex];
- return _mm_shuffle_epi8(abcd, shuf);
- }
- #endif
- static const int HASH_BITS = 6;
- ALIGN(16) unsigned char lookup_hash_chars[1 << HASH_BITS][16] = { //1Kb size
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
- {0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
- };
- static __m128i *const lookup_hash = (__m128i*) &lookup_hash_chars[0][0];
- static inline __m128i deduplicate4_ssse3(__m128i abcd) {
- __m128i bcda = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0, 3, 2, 1));
- __m128i cdab = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(1, 0, 3, 2));
- uint32_t mask1 = _mm_movemask_epi8(_mm_cmpeq_epi32(abcd, bcda));
- uint32_t mask2 = _mm_movemask_epi8(_mm_cmpeq_epi32(abcd, cdab));
- uint32_t maskFull = (mask2 << 16U) + mask1;
- //Note: minimal perfect hash function here
- uint32_t lutIndex = (maskFull * 0X0044CCCEU) >> 26U;
- __m128i shuf = lookup_hash[lutIndex];
- return _mm_shuffle_epi8(abcd, shuf);
- }
- static __m128i *const lookup_direct = (__m128i*) &lookup_bmi_chars[0][0];
- static __m128i *const lookup_direct_offset = lookup_direct - 0xC0U;
- static inline __m128i deduplicate4_ssse3_direct(__m128i abcd) {
- __m128i bcda = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0, 3, 2, 1));
- __m128i cdcd = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(3, 2, 3, 2));
- uint32_t mask1 = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(abcd, bcda)));
- uint32_t mask2 = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(abcd, cdcd)));
- uint32_t maskFull = 16U * mask2 + mask1;
- //Note: use index directly
- uint32_t lutIndex = maskFull;
- __m128i shuf = lookup_direct_offset[lutIndex];
- return _mm_shuffle_epi8(abcd, shuf);
- }
- //======================== Perfect hash generator =========================
- typedef uint32_t hash_int;
- //searching for a hash function of this kind:
- hash_int HashFunction(hash_int param, hash_int x, int bits) {
- return (param * x) >> hash_int(32 - bits);
- }
- static const hash_int NO_PARAM = hash_int(-1);
- hash_int GeneratePerfectHash(std::vector<hash_int> values, int bits, hash_int tries = NO_PARAM) {
- std::vector<int> table;
- for (hash_int param = 0; param < tries; param++) {
- table.assign(hash_int(1)<<bits, 0);
- bool ok = true;
- for (int i = 0; ok && i < values.size(); i++) {
- hash_int x = values[i];
- hash_int cell = HashFunction(param, x, bits);
- if (table[cell]++)
- ok = false;
- }
- if (ok)
- return param;
- }
- return NO_PARAM;
- }
- //======================== Lookup table generator =========================
- void PrintTable(const __m128i *lut, int size) {
- for (int i = 0; i < size; i++) {
- const unsigned char *entry = (unsigned char*) &lut[i];
- printf("\t{");
- for (int j = 0; j < 16; j++) {
- if (j) printf(", ");
- printf("0x%02X", (unsigned int)(entry[j]));
- }
- printf("},\n");
- }
- printf("\n");
- }
- void GenerateLookupTable() {
- for (int minMask = 0; minMask < 64; minMask++) {
- bool matr[4][4];
- matr[0][1] = (minMask >> 0) & 1;
- matr[1][2] = (minMask >> 1) & 1;
- matr[2][3] = (minMask >> 2) & 1;
- matr[0][3] = (minMask >> 3) & 1;
- matr[0][2] = (minMask >> 4) & 1;
- matr[1][3] = (minMask >> 5) & 1;
- int order[4] = {-1, -1, -1, -1};
- int pos = 0;
- for (int i = 0; i < 4; i++) {
- bool isNew = true;
- for (int j = 0; j < i; j++)
- if (matr[j][i])
- isNew = false;
- if (isNew)
- order[pos++] = i;
- }
- unsigned char *bytes = lookup_bmi_chars[minMask];
- for (int i = 0; i < 4; i++)
- for (int j = 0; j < 4; j++)
- bytes[4 * i + j] = (order[i] < 0 ? -1 : 4 * order[i] + j);
- }
- }
- uint32_t GenerateLookupTableHash() {
- GenerateLookupTable();
- std::vector<uint32_t> allMasks;
- for (int minMask = 0; minMask < 64; minMask++) {
- uint32_t fullMask = 0;
- for (int i = 0; i < 6; i++)
- if ((minMask >> i) & 1) {
- fullMask |= 0xFU << (4*i);
- if (i >= 4) //comparisons 4 and 6, 5 and 7 are same
- fullMask |= 0xFU << (4*(i+2));
- }
- allMasks.push_back(fullMask);
- }
- uint32_t param = GeneratePerfectHash(allMasks, HASH_BITS);
- assert(param != NO_PARAM);
- memset(lookup_hash, -1, sizeof(lookup_hash));
- for (int i = 0; i < allMasks.size(); i++) {
- uint32_t hash = HashFunction(param, allMasks[i], HASH_BITS);
- lookup_hash[hash] = lookup_bmi[i];
- }
- return param;
- }
- void GenAll() {
- GenerateLookupTable();
- printf("lookup_bmi:\n");
- PrintTable(lookup_bmi, 64);
- uint32_t param = GenerateLookupTableHash();
- printf("lookup_hash (%08X):\n", param);
- PrintTable(lookup_hash, 1 << HASH_BITS);
- }
- //=============================== Testing =================================
- static const int CASES = 1<<14;
- static const int TRIES = 1<<29;
- static union {
- uint32_t input_i[CASES][4];
- __m128i input_r[CASES];
- };
- int main() {
- /* GenAll();
- return 0;*/
- for (int i = 0; i < CASES; i++) {
- for (int j = 0; j < 4; j++)
- input_i[i][j] = rand() * 13457U + rand();
- int eq = rand() % 5;
- for (int c = 0; c < eq; c++) {
- int a = rand() % 4, b = rand() % 4;
- input_i[i][a] = input_i[i][b];
- }
- }
- for (int i = 0; i < CASES; i++) {
- uint32_t res0[4], res1[4], res2[4], res3[4];
- uint32_t *ptr = input_i[i];
- deduplicate4_trivial(ptr, res0);
- _mm_storeu_si128((__m128i*)res1, deduplicate4_ssse3(input_r[i]));
- _mm_storeu_si128((__m128i*)res2, deduplicate4_ssse3_direct(input_r[i]));
- memcpy(res3, res2, 16);
- #ifdef __AVX__
- _mm_storeu_si128((__m128i*)res3, deduplicate4_ssse3_bmi2(input_r[i]));
- #endif
- uint32_t srt0[4], srt1[4], srt2[4], srt3[4];
- memcpy(srt0, res0, 16);
- memcpy(srt1, res1, 16);
- memcpy(srt2, res2, 16);
- memcpy(srt3, res3, 16);
- std::sort(srt0, srt0 + 4);
- std::sort(srt1, srt1 + 4);
- std::sort(srt2, srt2 + 4);
- std::sort(srt3, srt3 + 4);
- if (memcmp(srt0, srt1, 16) || memcmp(srt0, srt2, 16) || memcmp(srt0, srt3, 16)) {
- printf("%d %d %d %d\n", ptr[0], ptr[1], ptr[2], ptr[3]);
- printf("%d %d %d %d\n", res0[0], res0[1], res0[2], res0[3]);
- printf("%d %d %d %d\n", res1[0], res1[1], res1[2], res1[3]);
- printf("%d %d %d %d\n", res2[0], res2[1], res2[2], res2[3]);
- printf("%d %d %d %d\n", res3[0], res3[1], res3[2], res3[3]);
- }
- }
- #ifdef __AVX__
- {
- __m128i sum = _mm_setzero_si128();
- int start = clock();
- for (int i = 0; i < TRIES; i++) {
- __m128i res = deduplicate4_ssse3_bmi2(input_r[i & (CASES-1)]);
- sum = _mm_add_epi32(sum, res);
- }
- uint32_t tmp[4];
- _mm_storeu_si128((__m128i*)tmp, sum);
- printf("Time = %0.3lf (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, tmp[0] + tmp[1] + tmp[2] + tmp[3]);
- }
- #endif
- {
- __m128i sum = _mm_setzero_si128();
- int start = clock();
- for (int i = 0; i < TRIES; i++) {
- __m128i res = deduplicate4_ssse3_direct(input_r[i & (CASES-1)]);
- sum = _mm_add_epi32(sum, res);
- }
- uint32_t tmp[4];
- _mm_storeu_si128((__m128i*)tmp, sum);
- printf("Time = %0.3lf (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, tmp[0] + tmp[1] + tmp[2] + tmp[3]);
- }
- {
- __m128i sum = _mm_setzero_si128();
- int start = clock();
- for (int i = 0; i < TRIES; i++) {
- __m128i res = deduplicate4_ssse3(input_r[i & (CASES-1)]);
- sum = _mm_add_epi32(sum, res);
- }
- uint32_t tmp[4];
- _mm_storeu_si128((__m128i*)tmp, sum);
- printf("Time = %0.3lf (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, tmp[0] + tmp[1] + tmp[2] + tmp[3]);
- }
- {
- uint32_t res[4];
- uint32_t sum = 0;
- int start = clock();
- for (int i = 0; i < TRIES; i++) {
- deduplicate4_trivial(input_i[i & (CASES-1)], res);
- sum += res[0] + res[1] + res[2] + res[3];
- }
- printf("Time = %0.3lf (%u)\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement