Test code and result for MVTools Overlaps

Test code:

#include <emmintrin.h>
#include <immintrin.h>
#include <avx2intrin.h>
#include <cstdint>
#include <random>
#include <cstdio>
#include <intrin.h>

#include "../iaca-win64/iacaMarks.h"

// Sample types used by the scalar reference kernel:
// PixelType2 is the 32-bit accumulator element, PixelType the 16-bit source element.
using PixelType2 = uint32_t;
using PixelType = uint16_t;

// Scalar reference implementation of the overlap accumulation.
//
// For each of the blockHeight rows and each of the blockWidth columns:
//     dst[col] += (src[col] * win[col]) >> 6
// where dst is read/written as 32-bit elements and src as 16-bit elements.
//
//   pDst8 / nDstPitch : destination rows; the pitch is added to the byte pointer, i.e. it is in bytes.
//   pSrc8 / nSrcPitch : source rows; the pitch is in bytes as well.
//   pWin  / nWinPitch : window weights; the pitch is added to the short pointer, i.e. it is in elements.
//
// Window weights are expected to lie in 0..2048 (note carried over from the original code).
template <int blockWidth, int blockHeight>
void Overlaps_C(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, short *pWin, intptr_t nWinPitch)
{
    for (int row = 0; row < blockHeight; ++row)
    {
        PixelType2 *accumRow = reinterpret_cast<PixelType2 *>(pDst8);
        const PixelType *sampleRow = reinterpret_cast<const PixelType *>(pSrc8);

        for (int col = 0; col < blockWidth; ++col)
        {
            // Both operands are promoted to int before the multiply; the shift is done on that int.
            const int weighted = (sampleRow[col] * pWin[col]) >> 6;
            accumRow[col] += weighted;
        }

        // Step every plane to its next row.
        pWin += nWinPitch;
        pSrc8 += nSrcPitch;
        pDst8 += nDstPitch;
    }
}

template <int blockWidth, int blockHeight>
void Overlaps_8to32xX_AVX2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch)
{
    // pWin from 0 to 2048
    for (int j = 0; j < blockHeight; j++)
    {
        uint32_t *pDst = (uint32_t *)pDst8;
        uint16_t const *pSrc = (uint16_t const *)pSrc8;

        for (int i = 0; i < blockWidth; i += 8)
        {
            __m256i src = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(pSrc + i)));
            __m256i win = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(pWin + i)));
            __m256i dst = _mm256_loadu_si256((__m256i*)(pDst + i));

            _mm256_storeu_si256((__m256i *)(pDst + i), _mm256_add_epi32(dst, _mm256_srli_epi32(_mm256_mullo_epi32(src, win), 6)));
        }

        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
        pWin += nWinPitch;
    }
}

template <int blockWidth, int blockHeight>
void Overlaps_4to32xX_SSE41(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch)
{
    // pWin from 0 to 2048
    for (int j = 0; j < blockHeight; j++)
    {
        uint32_t *pDst = (uint32_t *)pDst8;
        uint16_t const *pSrc = (uint16_t const *)pSrc8;

        for (int i = 0; i < blockWidth; i += 4)
        {
            __m128i src = _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)(pSrc + i)));
            __m128i win = _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)(pWin + i)));
            __m128i dst = _mm_loadu_si128((__m128i*)(pDst + i));

            _mm_storeu_si128((__m128i *)(pDst + i), _mm_add_epi32(dst, _mm_srli_epi32(_mm_mullo_epi32(src, win), 6)));
        }

        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
        pWin += nWinPitch;
    }
}

// Low 32 bits of each 32x32-bit lane product, using SSE2 instructions only.
//
// SSE2 has no packed 32-bit low multiply (_mm_mullo_epi32 is an SSE4.1 instruction), so the
// even and odd lanes are multiplied separately with _mm_mul_epu32 (32x32 -> 64 bit) and the low
// halves of the four products are interleaved back into lane order. The low 32 bits of a
// product do not depend on signedness, so the result matches _mm_mullo_epi32 for all inputs.
static inline __m128i Overlaps_MulLo32_SSE2(__m128i a, __m128i b)
{
    const __m128i even = _mm_mul_epu32(a, b);                                       // products of lanes 0 and 2
    const __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); // products of lanes 1 and 3

    // Gather the low dword of each 64-bit product into the two lowest lanes, then interleave.
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                              _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0)));
}

// SSE2 overlap accumulation, four pixels per inner step.
//
// Per element: dst[col] += (src[col] * win[col]) >> 6, with dst as 32-bit and src/win as 16-bit
// values. src is zero-extended and win is sign-extended to 32 bits; the shift is a logical
// right shift of the low 32 bits of the product.
//
//   pDst8 / nDstPitch : destination rows, pitch in bytes.
//   pSrc8 / nSrcPitch : source rows, pitch in bytes.
//   pWin  / nWinPitch : window weights, pitch in int16_t elements.
//
// The column loop advances by 4, so a blockWidth that is not a multiple of 4 makes the last
// step touch elements past blockWidth. Destination accesses use unaligned loads/stores.
// Window weights are expected to lie in 0..2048 (note carried over from the original code).
template <int blockWidth, int blockHeight>
void Overlaps_4to32xX_SSE2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch)
{
    for (int j = 0; j < blockHeight; j++)
    {
        uint32_t *pDst = (uint32_t *)pDst8;
        uint16_t const *pSrc = (uint16_t const *)pSrc8;

        for (int i = 0; i < blockWidth; i += 4)
        {
            // Widen four samples / weights to 32-bit lanes through scalar conversions.
            __m128i src = _mm_setr_epi32((int)(pSrc[i + 0]), (int)(pSrc[i + 1]), (int)(pSrc[i + 2]), (int)(pSrc[i + 3]));
            __m128i win = _mm_setr_epi32((int)(pWin[i + 0]), (int)(pWin[i + 1]), (int)(pWin[i + 2]), (int)(pWin[i + 3]));
            __m128i dst = _mm_loadu_si128((__m128i*)(pDst + i));

            // SSE2-only multiply: keeps this kernel usable on CPUs/builds without SSE4.1.
            __m128i scaled = _mm_srli_epi32(Overlaps_MulLo32_SSE2(src, win), 6);

            _mm_storeu_si128((__m128i *)(pDst + i), _mm_add_epi32(dst, scaled));
        }

        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
        pWin += nWinPitch;
    }
}

// Element count of each benchmark buffer; also passed as block width and pitch to the kernels.
constexpr intptr_t MAX = 65536;
// Exclusive upper bound of the benchmark loop counter, which doubles as the window weight.
constexpr uint16_t ITER = 2048;

using namespace std;

int main()
{
    uint32_t *dst = new uint32_t[MAX];
    int16_t  *win = new  int16_t[MAX];
    uint16_t *src = new uint16_t[MAX];
    uint32_t *sse = new uint32_t[MAX];


    uint64_t tot_c = 0, tot_avx = 0, tot_sse = 0, tot_s41 = 0;
    uint64_t start, end;

    for(uint64_t i = 0; i < MAX; i++)
        src[i] = i;

    for(uint16_t iter = 1; iter < ITER; iter++)
    {
        for(uint64_t i = 0; i < MAX; i++)
            win[i] = iter;

        start = __rdtsc();
        Overlaps_C<MAX, 1>((uint8_t*)dst, MAX, (uint8_t*)src, MAX, win, MAX);
        end = __rdtsc();
        tot_c += end - start;

        start = __rdtsc();
        Overlaps_4to32xX_SSE2<MAX, 1>((uint8_t*)sse, MAX, (uint8_t*)src, MAX, win, MAX);
        end = __rdtsc();
        tot_sse += end - start;

        start = __rdtsc();
        Overlaps_8to32xX_AVX2<MAX, 1>((uint8_t*)sse, MAX, (uint8_t*)src, MAX, win, MAX);
        end = __rdtsc();
        tot_avx += end - start;

        start = __rdtsc();
        Overlaps_4to32xX_SSE41<MAX, 1>((uint8_t*)sse, MAX, (uint8_t*)src, MAX, win, MAX);
        end = __rdtsc();
        tot_s41 += end - start;


        printf("cicli c  : %20llu\n"
               "cicli avx: %20llu\n"
               "cicli sse: %20llu\n"
               "cicli s41: %20llu\n"
               "diffenza c/avx: %f\n"
               "diffenza c/sse: %f\n"
               "diffenza c/s41: %f\n"
               "diffenza sse/avx: %f\n",
               tot_c, tot_avx, tot_sse, tot_s41,
               (float)tot_c/(float)tot_avx, (float)tot_c/(float)tot_sse, (float)tot_c/(float)tot_s41, (float)tot_sse/(float)tot_avx);

        tot_c = tot_avx = tot_sse = tot_s41 = 0;


    }
    system("pause"