Vectorizing linear search

#include <stdio.h>
#include <assert.h>
#include <time.h>
#include <algorithm>
#include <emmintrin.h>
#include <tmmintrin.h>  //_mm_hadd_epi32
#include <smmintrin.h>  //_mm_extract_epi32
#include <intrin.h>
#include <stdint.h>

//implementation by OP
static int linear_OP (const int *arr, int n, int key) {
    int i = 0;
    while (i < n) {
        if (arr [i] >= key)
            break;
        ++i;
    }
    return i;
}

//scalar implementation by stgatilov (that's me=))
static int linear_stgatilov_scalar (const int *arr, int n, int key) {
    int cnt = 0;
    for (int i = 0; i < n; i++)
        cnt += (arr[i] < key);
    return cnt;
}

//vectorized implementation by stgatilov (that's me again=))
static int linear_stgatilov_vec (const int *arr, int n, int key) {
    assert(size_t(arr) % 16 == 0);
    __m128i vkey = _mm_set1_epi32(key);
        __m128i cnt = _mm_setzero_si128();
        for (int i = 0; i < n; i += 16) {
            __m128i mask0 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+0]), vkey);
            __m128i mask1 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+4]), vkey);
            __m128i mask2 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+8]), vkey);
            __m128i mask3 = _mm_cmplt_epi32(_mm_load_si128((__m128i *)&arr[i+12]), vkey);
            __m128i sum = _mm_add_epi32(_mm_add_epi32(mask0, mask1), _mm_add_epi32(mask2, mask3));
            cnt = _mm_sub_epi32(cnt, sum);
        }
        cnt = _mm_hadd_epi32(cnt, cnt);
        cnt = _mm_hadd_epi32(cnt, cnt);
//      int ans = _mm_extract_epi32(cnt, 0);    //SSE4.1
        int ans = _mm_extract_epi16(cnt, 0);    //correct only for n < 32K
        return ans;
}

//implementation by Paul R
static int linear_PaulR(const int32_t *A, int n, int32_t key)
{
#define VEC_INT_ELEMS 4
#define BLOCK_SIZE (VEC_INT_ELEMS * 32)
    const __m128i vkey = _mm_set1_epi32(key);
    int vresult = -1;
    int result = -1;
    int i, j;

    for (i = 0; i <= n - BLOCK_SIZE; i += BLOCK_SIZE)
    {
        __m128i vmask0 = _mm_set1_epi32(-1);
        __m128i vmask1 = _mm_set1_epi32(-1);
        int mask0, mask1;

        for (j = 0; j < BLOCK_SIZE; j += VEC_INT_ELEMS * 2)
        {
            __m128i vA0 = _mm_load_si128((__m128i *)&A[i + j]);
            __m128i vA1 = _mm_load_si128((__m128i *)&A[i + j + VEC_INT_ELEMS]);
            __m128i vcmp0 = _mm_cmpgt_epi32(vkey, vA0);
            __m128i vcmp1 = _mm_cmpgt_epi32(vkey, vA1);
            vmask0 = _mm_and_si128(vmask0, vcmp0);
            vmask1 = _mm_and_si128(vmask1, vcmp1);
        }
        mask0 = _mm_movemask_epi8(vmask0);
        mask1 = _mm_movemask_epi8(vmask1);
        if ((mask0 & mask1) != 0xffff)
        {
            vresult = i;
            break;
        }
    }
    if (vresult > -1)
    {
        result = vresult + linear_OP(&A[vresult], BLOCK_SIZE, key);
    }
    else if (i < n)
    {
        result = i + linear_OP(&A[i], n - i, key);
    }
    return result;
#undef BLOCK_SIZE
#undef VEC_INT_ELEMS
}

//Testing code below

static const int SIZE = 197;
int n = SIZE;
static union {
    int arr[SIZE + 16];
    __m128i align;
};
static const int TRIES = 1<<23;
static const int SAMPLES = 1<<10;
int tmp[SAMPLES];

int main() {
    for (int i = 0; i < n; i++)
        arr[i] = rand();
    std::sort(arr, arr+n);
    //Note: padding by maximal values is required!
    for (int i = n; i < (i+15)/16*16; i++)
        arr[i] = INT_MAX;

    for (int t = 0; t < TRIES/10; t++) {
        int q = rand();
        int ans = linear_OP(arr, n, q);
        int res1 = linear_PaulR(arr, n, q);
        int res2 = linear_stgatilov_scalar(arr, n, q);
        int res3 = linear_stgatilov_vec(arr, n, q);
        if (ans != res1 || ans != res2 || ans != res3)
            printf("error (%d): %d, %d, %d, %d\n", q, ans, res1, res2, res3);
    }

    for (int i = 0; i < SAMPLES; i++)
        tmp[i] = rand();

    {
        int start = clock();
        int check = 0;
        for (int t = 0; t < TRIES; t++) {
            int q = tmp[t & (SAMPLES-1)];
            int res = linear_OP(arr, n, q);
            check += res;
        }
        double elapsed = double(clock() - start) / CLOCKS_PER_SEC;
        printf("[OP]\n");
        printf("Time = %0.3lf (%d)\n", elapsed, check);
    }

    {
        int start = clock();
        int check = 0;
        for (int t = 0; t < TRIES; t++) {
            int q = tmp[t & (SAMPLES-1)];
            int res = linear_PaulR(arr, n, q);
            check += res;
        }
        double elapsed = double(clock() - start) / CLOCKS_PER_SEC;
        printf("[Paul R]\n");
        printf("Time = %0.3lf (%d)\n", elapsed, check);
    }

    {
        int start = clock();
        int check = 0;
        for (int t = 0; t < TRIES; t++) {
            int q = tmp[t & (SAMPLES-1)];
            int res = linear_stgatilov_vec(arr, n, q);
            check += res;
        }
        double elapsed = double(clock() - start) / CLOCKS_PER_SEC;
        printf("[stgatilov]\n");
        printf("Time = %0.3lf (%d)\n", elapsed, check);
    }

    return 0;
}