Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <assert.h>
- #include <time.h>
- #include <algorithm>
- #include <emmintrin.h>
- #include <tmmintrin.h> //_mm_hadd_epi32
- #include <smmintrin.h> //_mm_extract_epi32
- #include <intrin.h>
- #include <stdint.h>
- //implementation by OP
- static int linear_OP (const int *arr, int n, int key) {
- int i = 0;
- while (i < n) {
- if (arr [i] >= key)
- break;
- ++i;
- }
- return i;
- }
- //scalar implementation by stgatilov (that's me=))
- static int linear_stgatilov_scalar (const int *arr, int n, int key) {
- int cnt = 0;
- for (int i = 0; i < n; i++)
- cnt += (arr[i] < key);
- return cnt;
- }
- //vectorized implementation by stgatilov (that's me again=))
- static int linear_stgatilov_vec (const int *arr, int n, int key) {
- assert(size_t(arr) % 16 == 0);
- __m128i vkey = _mm_set1_epi32(key);
- __m128i cnt = _mm_setzero_si128();
- for (int i = 0; i < n; i += 16) {
- __m128i mask0 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+0]), vkey);
- __m128i mask1 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+4]), vkey);
- __m128i mask2 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+8]), vkey);
- __m128i mask3 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+12]), vkey);
- __m128i sum = _mm_add_epi32(_mm_add_epi32(mask0, mask1), _mm_add_epi32(mask2, mask3));
- cnt = _mm_sub_epi32(cnt, sum);
- }
- cnt = _mm_hadd_epi32(cnt, cnt);
- cnt = _mm_hadd_epi32(cnt, cnt);
- // int ans = _mm_extract_epi32(cnt, 0); //SSE4.1
- int ans = _mm_extract_epi16(cnt, 0); //correct only for n < 32K
- return ans;
- }
- //implementation by Paul R
- static int linear_PaulR(const int32_t *A, int n, int32_t key)
- {
- #define VEC_INT_ELEMS 4
- #define BLOCK_SIZE (VEC_INT_ELEMS * 32)
- const __m128i vkey = _mm_set1_epi32(key);
- int vresult = -1;
- int result = -1;
- int i, j;
- for (i = 0; i <= n - BLOCK_SIZE; i += BLOCK_SIZE)
- {
- __m128i vmask0 = _mm_set1_epi32(-1);
- __m128i vmask1 = _mm_set1_epi32(-1);
- int mask0, mask1;
- for (j = 0; j < BLOCK_SIZE; j += VEC_INT_ELEMS * 2)
- {
- __m128i vA0 = _mm_load_si128((__m128i *)&A[i + j]);
- __m128i vA1 = _mm_load_si128((__m128i *)&A[i + j + VEC_INT_ELEMS]);
- __m128i vcmp0 = _mm_cmpgt_epi32(vkey, vA0);
- __m128i vcmp1 = _mm_cmpgt_epi32(vkey, vA1);
- vmask0 = _mm_and_si128(vmask0, vcmp0);
- vmask1 = _mm_and_si128(vmask1, vcmp1);
- }
- mask0 = _mm_movemask_epi8(vmask0);
- mask1 = _mm_movemask_epi8(vmask1);
- if ((mask0 & mask1) != 0xffff)
- {
- vresult = i;
- break;
- }
- }
- if (vresult > -1)
- {
- result = vresult + linear_OP(&A[vresult], BLOCK_SIZE, key);
- }
- else if (i < n)
- {
- result = i + linear_OP(&A[i], n - i, key);
- }
- return result;
- #undef BLOCK_SIZE
- #undef VEC_INT_ELEMS
- }
- //Testing code below
- static const int SIZE = 197;
- int n = SIZE;
- static union {
- int arr[SIZE + 16];
- __m128i align;
- };
- static const int TRIES = 1<<23;
- static const int SAMPLES = 1<<10;
- int tmp[SAMPLES];
- int main() {
- for (int i = 0; i < n; i++)
- arr[i] = rand();
- std::sort(arr, arr+n);
- //Note: padding by maximal values is required!
- for (int i = n; i < (i+15)/16*16; i++)
- arr[i] = INT_MAX;
- for (int t = 0; t < TRIES/10; t++) {
- int q = rand();
- int ans = linear_OP(arr, n, q);
- int res1 = linear_PaulR(arr, n, q);
- int res2 = linear_stgatilov_scalar(arr, n, q);
- int res3 = linear_stgatilov_vec(arr, n, q);
- if (ans != res1 || ans != res2 || ans != res3)
- printf("error (%d): %d, %d, %d, %d\n", q, ans, res1, res2, res3);
- }
- for (int i = 0; i < SAMPLES; i++)
- tmp[i] = rand();
- {
- int start = clock();
- int check = 0;
- for (int t = 0; t < TRIES; t++) {
- int q = tmp[t & (SAMPLES-1)];
- int res = linear_OP(arr, n, q);
- check += res;
- }
- double elapsed = double(clock() - start) / CLOCKS_PER_SEC;
- printf("[OP]\n");
- printf("Time = %0.3lf (%d)\n", elapsed, check);
- }
- {
- int start = clock();
- int check = 0;
- for (int t = 0; t < TRIES; t++) {
- int q = tmp[t & (SAMPLES-1)];
- int res = linear_PaulR(arr, n, q);
- check += res;
- }
- double elapsed = double(clock() - start) / CLOCKS_PER_SEC;
- printf("[Paul R]\n");
- printf("Time = %0.3lf (%d)\n", elapsed, check);
- }
- {
- int start = clock();
- int check = 0;
- for (int t = 0; t < TRIES; t++) {
- int q = tmp[t & (SAMPLES-1)];
- int res = linear_stgatilov_vec(arr, n, q);
- check += res;
- }
- double elapsed = double(clock() - start) / CLOCKS_PER_SEC;
- printf("[stgatilov]\n");
- printf("Time = %0.3lf (%d)\n", elapsed, check);
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement