Advertisement
MonoS

Test code and result for MVTools Overlaps

Dec 13th, 2015
87
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 26.75 KB | None | 0 0
  1. Test code:
  2.  
  3. #include <emmintrin.h>
  4. #include <immintrin.h>
  5. #include <avx2intrin.h>
  6. #include <cstdint>
  7. #include <random>
  8. #include <cstdio>
  9. #include <intrin.h>
  10.  
  11. #include "../iaca-win64/iacaMarks.h"
  12.  
  // Element types used by the scalar reference implementation below.
  using PixelType2 = uint32_t;   // accumulator (destination) element
  using PixelType  = uint16_t;   // source sample element

  /// Scalar reference: accumulates a window-weighted source block into a
  /// 32-bit destination block.
  ///
  /// For every row and column: dst += (src * win) >> 6, where the product is
  /// formed in `int` after the usual integer promotions.
  ///
  /// @tparam blockWidth  number of columns processed per row
  /// @tparam blockHeight number of rows processed
  /// @param pDst8     first destination row (read as PixelType2 elements)
  /// @param nDstPitch destination row stride, in bytes
  /// @param pSrc8     first source row (read as PixelType elements)
  /// @param nSrcPitch source row stride, in bytes
  /// @param pWin      first window row; values are expected in 0..2048
  /// @param nWinPitch window row stride, in elements (not bytes)
  template <int blockWidth, int blockHeight>
  void Overlaps_C(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, short *pWin, intptr_t nWinPitch)
  {
      for (int row = 0; row < blockHeight; ++row)
      {
          PixelType2 *const accumRow = reinterpret_cast<PixelType2 *>(pDst8);
          const PixelType *const sampleRow = reinterpret_cast<const PixelType *>(pSrc8);

          int col = 0;
          while (col < blockWidth)
          {
              // Both operands promote to int, so the product is a signed int.
              const int weighted = sampleRow[col] * pWin[col];
              accumRow[col] += weighted >> 6;
              ++col;
          }

          // Step to the next row: the window advances in elements, the
          // destination and source advance in bytes.
          pWin += nWinPitch;
          pSrc8 += nSrcPitch;
          pDst8 += nDstPitch;
      }
  }
  35.  
  // Build only this function for AVX2. Without a per-function target the whole
  // translation unit has to be compiled with -mavx2 (GCC/Clang), which would
  // also allow AVX2 code generation in the scalar and SSE variants.
  #ifndef OVERLAPS_TARGET_AVX2
  #  if defined(__GNUC__) || defined(__clang__)
  #    define OVERLAPS_TARGET_AVX2 __attribute__((target("avx2")))
  #  else
  #    define OVERLAPS_TARGET_AVX2
  #  endif
  #endif

  /// AVX2 variant: accumulates a window-weighted 16-bit source block into a
  /// 32-bit destination block, eight columns per iteration.
  ///
  /// Per element: dst += (src * win) >> 6. Source and window samples are both
  /// zero-extended from 16 to 32 bits, the low 32 bits of the product are
  /// kept, and the shift is a logical right shift.
  ///
  /// All loads and stores are unaligned. Each iteration touches eight
  /// consecutive elements, so blockWidth should be a multiple of 8; otherwise
  /// the last iteration of a row reads and writes past blockWidth columns.
  ///
  /// @tparam blockWidth  number of columns processed per row
  /// @tparam blockHeight number of rows processed
  /// @param pDst8     first destination row (uint32_t elements)
  /// @param nDstPitch destination row stride, in bytes
  /// @param pSrc8     first source row (uint16_t elements)
  /// @param nSrcPitch source row stride, in bytes
  /// @param pWin      first window row; values are expected in 0..2048
  /// @param nWinPitch window row stride, in elements (not bytes)
  template <int blockWidth, int blockHeight>
  OVERLAPS_TARGET_AVX2
  void Overlaps_8to32xX_AVX2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch)
  {
      for (int j = 0; j < blockHeight; j++)
      {
          uint32_t *pDst = reinterpret_cast<uint32_t *>(pDst8);
          const uint16_t *pSrc = reinterpret_cast<const uint16_t *>(pSrc8);

          for (int i = 0; i < blockWidth; i += 8)
          {
              // Widen eight 16-bit samples / weights to 32-bit lanes.
              const __m256i src = _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i *>(pSrc + i)));
              const __m256i win = _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i *>(pWin + i)));

              __m256i *const pAcc = reinterpret_cast<__m256i *>(pDst + i);
              const __m256i dst = _mm256_loadu_si256(pAcc);

              const __m256i scaled = _mm256_srli_epi32(_mm256_mullo_epi32(src, win), 6);
              _mm256_storeu_si256(pAcc, _mm256_add_epi32(dst, scaled));
          }

          pDst8 += nDstPitch;
          pSrc8 += nSrcPitch;
          pWin += nWinPitch;
      }
  }
  59.  
  // Build only this function for SSE4.1. Without a per-function target the
  // whole translation unit has to be compiled with -msse4.1 (GCC/Clang), which
  // would also allow SSE4.1 code generation in the scalar and SSE2 variants.
  #ifndef OVERLAPS_TARGET_SSE41
  #  if defined(__GNUC__) || defined(__clang__)
  #    define OVERLAPS_TARGET_SSE41 __attribute__((target("sse4.1")))
  #  else
  #    define OVERLAPS_TARGET_SSE41
  #  endif
  #endif

  /// SSE4.1 variant: accumulates a window-weighted 16-bit source block into a
  /// 32-bit destination block, four columns per iteration.
  ///
  /// Per element: dst += (src * win) >> 6. Source and window samples are both
  /// zero-extended from 16 to 32 bits, the low 32 bits of the product are
  /// kept, and the shift is a logical right shift.
  ///
  /// All loads and stores are unaligned. Each iteration touches four
  /// consecutive elements, so blockWidth should be a multiple of 4; otherwise
  /// the last iteration of a row reads and writes past blockWidth columns.
  ///
  /// @tparam blockWidth  number of columns processed per row
  /// @tparam blockHeight number of rows processed
  /// @param pDst8     first destination row (uint32_t elements)
  /// @param nDstPitch destination row stride, in bytes
  /// @param pSrc8     first source row (uint16_t elements)
  /// @param nSrcPitch source row stride, in bytes
  /// @param pWin      first window row; values are expected in 0..2048
  /// @param nWinPitch window row stride, in elements (not bytes)
  template <int blockWidth, int blockHeight>
  OVERLAPS_TARGET_SSE41
  void Overlaps_4to32xX_SSE41(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch)
  {
      for (int j = 0; j < blockHeight; j++)
      {
          uint32_t *pDst = reinterpret_cast<uint32_t *>(pDst8);
          const uint16_t *pSrc = reinterpret_cast<const uint16_t *>(pSrc8);

          for (int i = 0; i < blockWidth; i += 4)
          {
              // Load four 16-bit samples / weights (64 bits) and widen to 32-bit lanes.
              const __m128i src = _mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(pSrc + i)));
              const __m128i win = _mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(pWin + i)));

              __m128i *const pAcc = reinterpret_cast<__m128i *>(pDst + i);
              const __m128i dst = _mm_loadu_si128(pAcc);

              const __m128i scaled = _mm_srli_epi32(_mm_mullo_epi32(src, win), 6);
              _mm_storeu_si128(pAcc, _mm_add_epi32(dst, scaled));
          }

          pDst8 += nDstPitch;
          pSrc8 += nSrcPitch;
          pWin += nWinPitch;
      }
  }
  83.  
  /// SSE2 variant: accumulates a window-weighted 16-bit source block into a
  /// 32-bit destination block, four columns per iteration.
  ///
  /// Per element: dst += (src * win) >> 6, where src is an unsigned 16-bit
  /// sample, win is a signed 16-bit weight, the low 32 bits of the signed
  /// product are kept, and the shift is a logical right shift.
  ///
  /// Only SSE2 instructions are used. The 32-bit product is assembled from
  /// 16-bit multiplies because the packed 32-bit low multiply
  /// (_mm_mullo_epi32) is an SSE4.1 instruction.
  ///
  /// All loads and stores are unaligned. Each iteration touches four
  /// consecutive elements, so blockWidth should be a multiple of 4; otherwise
  /// the last iteration of a row reads and writes past blockWidth columns.
  ///
  /// @tparam blockWidth  number of columns processed per row
  /// @tparam blockHeight number of rows processed
  /// @param pDst8     first destination row (uint32_t elements)
  /// @param nDstPitch destination row stride, in bytes
  /// @param pSrc8     first source row (uint16_t elements)
  /// @param nSrcPitch source row stride, in bytes
  /// @param pWin      first window row; values are expected in 0..2048
  /// @param nWinPitch window row stride, in elements (not bytes)
  template <int blockWidth, int blockHeight>
  void Overlaps_4to32xX_SSE2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch)
  {
      for (int j = 0; j < blockHeight; j++)
      {
          uint32_t *pDst = reinterpret_cast<uint32_t *>(pDst8);
          const uint16_t *pSrc = reinterpret_cast<const uint16_t *>(pSrc8);

          for (int i = 0; i < blockWidth; i += 4)
          {
              // Four 16-bit samples / weights in the low 64 bits; upper lanes are zero.
              const __m128i src = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pSrc + i));
              const __m128i win = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pWin + i));

              // Low and (unsigned) high 16-bit halves of each 16x16 product.
              const __m128i prodLo = _mm_mullo_epi16(src, win);
              const __m128i prodHiUnsigned = _mm_mulhi_epu16(src, win);

              // The unsigned multiply sees a negative weight w as w + 65536,
              // which inflates the high half by src. Subtract src in exactly
              // those lanes to obtain the high half of the signed product.
              const __m128i negLanes = _mm_srai_epi16(win, 15);
              const __m128i prodHi = _mm_sub_epi16(prodHiUnsigned, _mm_and_si128(src, negLanes));

              // Interleave halves into four 32-bit products.
              const __m128i prod = _mm_unpacklo_epi16(prodLo, prodHi);

              __m128i *const pAcc = reinterpret_cast<__m128i *>(pDst + i);
              const __m128i dst = _mm_loadu_si128(pAcc);
              _mm_storeu_si128(pAcc, _mm_add_epi32(dst, _mm_srli_epi32(prod, 6)));
          }

          pDst8 += nDstPitch;
          pSrc8 += nSrcPitch;
          pWin += nWinPitch;
      }
  }
  107.  
  // Element count of each benchmark buffer; also passed as both the block
  // width and the pitch arguments in the benchmark calls.
  constexpr intptr_t MAX = 65536;

  // Exclusive upper bound of the window value swept by the benchmark loop.
  constexpr uint16_t ITER = 2048;

  using namespace std;
  112.  
  113. int main()
  114. {
  115.     uint32_t *dst = new uint32_t[MAX];
  116.     int16_t  *win = new  int16_t[MAX];
  117.     uint16_t *src = new uint16_t[MAX];
  118.     uint32_t *sse = new uint32_t[MAX];
  119.  
  120.  
  121.     uint64_t tot_c = 0, tot_avx = 0, tot_sse = 0, tot_s41 = 0;
  122.     uint64_t start, end;
  123.  
  124.     for(uint64_t i = 0; i < MAX; i++)
  125.         src[i] = i;
  126.  
  127.     for(uint16_t iter = 1; iter < ITER; iter++)
  128.     {
  129.         for(uint64_t i = 0; i < MAX; i++)
  130.             win[i] = iter;
  131.  
  132.         start = __rdtsc();
  133.         Overlaps_C<MAX, 1>((uint8_t*)dst, MAX, (uint8_t*)src, MAX, win, MAX);
  134.         end = __rdtsc();
  135.         tot_c += end - start;
  136.  
  137.         start = __rdtsc();
  138.         Overlaps_4to32xX_SSE2<MAX, 1>((uint8_t*)sse, MAX, (uint8_t*)src, MAX, win, MAX);
  139.         end = __rdtsc();
  140.         tot_sse += end - start;
  141.  
  142.         start = __rdtsc();
  143.         Overlaps_8to32xX_AVX2<MAX, 1>((uint8_t*)sse, MAX, (uint8_t*)src, MAX, win, MAX);
  144.         end = __rdtsc();
  145.         tot_avx += end - start;
  146.  
  147.         start = __rdtsc();
  148.         Overlaps_4to32xX_SSE41<MAX, 1>((uint8_t*)sse, MAX, (uint8_t*)src, MAX, win, MAX);
  149.         end = __rdtsc();
  150.         tot_s41 += end - start;
  151.  
  152.  
  153.         printf("cicli c  : %20llu\n"
  154.                "cicli avx: %20llu\n"
  155.                "cicli sse: %20llu\n"
  156.                "cicli s41: %20llu\n"
  157.                "diffenza c/avx: %f\n"
  158.                "diffenza c/sse: %f\n"
  159.                "diffenza c/s41: %f\n"
  160.                "diffenza sse/avx: %f\n",
  161.                tot_c, tot_avx, tot_sse, tot_s41,
  162.                (float)tot_c/(float)tot_avx, (float)tot_c/(float)tot_sse, (float)tot_c/(float)tot_s41, (float)tot_sse/(float)tot_avx);
  163.  
  164.         tot_c = tot_avx = tot_sse = tot_s41 = 0;
  165.  
  166.  
  167.     }
  168.     system("pause"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement