Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Test code:
- #include <emmintrin.h>
- #include <immintrin.h>
- #include <avx2intrin.h>
- #include <cstdint>
- #include <random>
- #include <cstdio>
- #include <intrin.h>
- #include "../iaca-win64/iacaMarks.h"
// Element types used by the scalar reference: 32-bit accumulators, 16-bit samples.
using PixelType2 = uint32_t;
using PixelType = uint16_t;

// Scalar reference implementation of the windowed overlap accumulation.
//
// For every element of a blockWidth x blockHeight block this adds
// (source sample * window weight) >> 6 to the matching 32-bit accumulator.
//
//   pDst8     - first row of the accumulators, read and written as PixelType2
//   nDstPitch - distance between accumulator rows, in bytes
//   pSrc8     - first row of the source samples, read as PixelType
//   nSrcPitch - distance between source rows, in bytes
//   pWin      - first row of the window weights
//   nWinPitch - distance between window rows, in elements (not bytes)
template <int blockWidth, int blockHeight>
void Overlaps_C(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, short *pWin, intptr_t nWinPitch)
{
    // pWin from 0 to 2048
    for (int row = 0; row < blockHeight; ++row)
    {
        PixelType2 *dstRow = reinterpret_cast<PixelType2 *>(pDst8);
        const PixelType *srcRow = reinterpret_cast<const PixelType *>(pSrc8);

        for (int col = 0; col < blockWidth; ++col)
        {
            // Both operands promote to int, so the product and shift are done in int.
            const int weighted = (srcRow[col] * pWin[col]) >> 6;
            dstRow[col] += weighted;
        }

        // Destination and source advance by bytes, the window by elements.
        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
        pWin += nWinPitch;
    }
}
// AVX2 version of the windowed overlap accumulation, eight elements per step.
//
// Each step zero-extends eight 16-bit source samples and eight 16-bit window
// weights to 32 bits, multiplies them, shifts the product right by 6 (logical
// shift) and adds it to eight 32-bit accumulators. Every step touches eight
// elements, so blockWidth is expected to be a multiple of 8.
//
//   pDst8     - first row of the uint32_t accumulators (unaligned access is used)
//   nDstPitch - distance between accumulator rows, in bytes
//   pSrc8     - first row of the uint16_t source samples
//   nSrcPitch - distance between source rows, in bytes
//   pWin      - first row of the window weights
//   nWinPitch - distance between window rows, in elements (not bytes)
template <int blockWidth, int blockHeight>
void Overlaps_8to32xX_AVX2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch)
{
    // pWin from 0 to 2048
    for (int row = 0; row < blockHeight; ++row)
    {
        uint32_t *dstRow = reinterpret_cast<uint32_t *>(pDst8);
        const uint16_t *srcRow = reinterpret_cast<const uint16_t *>(pSrc8);

        for (int col = 0; col < blockWidth; col += 8)
        {
            // Widen 8 x 16-bit samples and weights to 8 x 32-bit lanes.
            const __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(srcRow + col));
            const __m128i win16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pWin + col));
            const __m256i src32 = _mm256_cvtepu16_epi32(src16);
            const __m256i win32 = _mm256_cvtepu16_epi32(win16);

            __m256i *acc = reinterpret_cast<__m256i *>(dstRow + col);
            const __m256i previous = _mm256_loadu_si256(acc);
            const __m256i weighted = _mm256_srli_epi32(_mm256_mullo_epi32(src32, win32), 6);
            _mm256_storeu_si256(acc, _mm256_add_epi32(previous, weighted));
        }

        // Destination and source advance by bytes, the window by elements.
        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
        pWin += nWinPitch;
    }
}
// SSE4.1 version of the windowed overlap accumulation, four elements per step.
//
// Each step zero-extends four 16-bit source samples and four 16-bit window
// weights to 32 bits, multiplies them, shifts the product right by 6 (logical
// shift) and adds it to four 32-bit accumulators. Every step touches four
// elements, so blockWidth is expected to be a multiple of 4.
//
//   pDst8     - first row of the uint32_t accumulators (unaligned access is used)
//   nDstPitch - distance between accumulator rows, in bytes
//   pSrc8     - first row of the uint16_t source samples
//   nSrcPitch - distance between source rows, in bytes
//   pWin      - first row of the window weights
//   nWinPitch - distance between window rows, in elements (not bytes)
template <int blockWidth, int blockHeight>
void Overlaps_4to32xX_SSE41(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch)
{
    // pWin from 0 to 2048
    for (int row = 0; row < blockHeight; ++row)
    {
        uint32_t *dstRow = reinterpret_cast<uint32_t *>(pDst8);
        const uint16_t *srcRow = reinterpret_cast<const uint16_t *>(pSrc8);

        for (int col = 0; col < blockWidth; col += 4)
        {
            // Widen 4 x 16-bit samples and weights (64 bits each) to 4 x 32-bit lanes.
            const __m128i src16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(srcRow + col));
            const __m128i win16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pWin + col));
            const __m128i src32 = _mm_cvtepu16_epi32(src16);
            const __m128i win32 = _mm_cvtepu16_epi32(win16);

            __m128i *acc = reinterpret_cast<__m128i *>(dstRow + col);
            const __m128i previous = _mm_loadu_si128(acc);
            const __m128i weighted = _mm_srli_epi32(_mm_mullo_epi32(src32, win32), 6);
            _mm_storeu_si128(acc, _mm_add_epi32(previous, weighted));
        }

        // Destination and source advance by bytes, the window by elements.
        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
        pWin += nWinPitch;
    }
}
// SSE2 version of the windowed overlap accumulation, four elements per step.
//
// For each element this adds (source sample * window weight) >> 6 to the
// matching 32-bit accumulator, where the sample is treated as unsigned 16-bit,
// the weight as signed 16-bit, and the shift is a logical shift of the 32-bit
// product.
//
// Only SSE2 instructions are used: the 32-bit product is assembled from the
// low and high 16-bit halves of a 16x16 multiply, because a packed 32-bit
// low multiply (_mm_mullo_epi32) only exists from SSE4.1 onwards.
//
// Every step touches four elements, so blockWidth is expected to be a
// multiple of 4.
//
//   pDst8     - first row of the uint32_t accumulators (unaligned access is used)
//   nDstPitch - distance between accumulator rows, in bytes
//   pSrc8     - first row of the uint16_t source samples
//   nSrcPitch - distance between source rows, in bytes
//   pWin      - first row of the window weights
//   nWinPitch - distance between window rows, in elements (not bytes)
template <int blockWidth, int blockHeight>
void Overlaps_4to32xX_SSE2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch)
{
    // pWin from 0 to 2048
    for (int j = 0; j < blockHeight; j++)
    {
        uint32_t *pDst = reinterpret_cast<uint32_t *>(pDst8);
        const uint16_t *pSrc = reinterpret_cast<const uint16_t *>(pSrc8);
        for (int i = 0; i < blockWidth; i += 4)
        {
            // Four 16-bit samples / weights in the low 64 bits of each register.
            const __m128i src = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pSrc + i));
            const __m128i win = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pWin + i));

            // Low and high halves of the unsigned 16x16 -> 32-bit product.
            const __m128i lo = _mm_mullo_epi16(src, win);
            __m128i hi = _mm_mulhi_epu16(src, win);

            // The weight is signed: reading a negative weight as unsigned adds
            // src << 16 to the product, so take src back out of the high half.
            hi = _mm_sub_epi16(hi, _mm_and_si128(src, _mm_srai_epi16(win, 15)));

            // Interleave halves into four 32-bit products.
            const __m128i prod = _mm_unpacklo_epi16(lo, hi);

            __m128i *pAcc = reinterpret_cast<__m128i *>(pDst + i);
            const __m128i dst = _mm_loadu_si128(pAcc);
            _mm_storeu_si128(pAcc, _mm_add_epi32(dst, _mm_srli_epi32(prod, 6)));
        }
        // Destination and source advance by bytes, the window by elements.
        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
        pWin += nWinPitch;
    }
}
// Element count of every benchmark buffer allocated in main(); main() also
// passes it as the template block width and as each pitch argument.
constexpr intptr_t MAX = 65536;

// Exclusive upper bound of the window-weight sweep in main() (weights 1 .. ITER - 1).
constexpr uint16_t ITER = 2048;

using namespace std;
- int main()
- {
- uint32_t *dst = new uint32_t[MAX];
- int16_t *win = new int16_t[MAX];
- uint16_t *src = new uint16_t[MAX];
- uint32_t *sse = new uint32_t[MAX];
- uint64_t tot_c = 0, tot_avx = 0, tot_sse = 0, tot_s41 = 0;
- uint64_t start, end;
- for(uint64_t i = 0; i < MAX; i++)
- src[i] = i;
- for(uint16_t iter = 1; iter < ITER; iter++)
- {
- for(uint64_t i = 0; i < MAX; i++)
- win[i] = iter;
- start = __rdtsc();
- Overlaps_C<MAX, 1>((uint8_t*)dst, MAX, (uint8_t*)src, MAX, win, MAX);
- end = __rdtsc();
- tot_c += end - start;
- start = __rdtsc();
- Overlaps_4to32xX_SSE2<MAX, 1>((uint8_t*)sse, MAX, (uint8_t*)src, MAX, win, MAX);
- end = __rdtsc();
- tot_sse += end - start;
- start = __rdtsc();
- Overlaps_8to32xX_AVX2<MAX, 1>((uint8_t*)sse, MAX, (uint8_t*)src, MAX, win, MAX);
- end = __rdtsc();
- tot_avx += end - start;
- start = __rdtsc();
- Overlaps_4to32xX_SSE41<MAX, 1>((uint8_t*)sse, MAX, (uint8_t*)src, MAX, win, MAX);
- end = __rdtsc();
- tot_s41 += end - start;
- printf("cicli c : %20llu\n"
- "cicli avx: %20llu\n"
- "cicli sse: %20llu\n"
- "cicli s41: %20llu\n"
- "diffenza c/avx: %f\n"
- "diffenza c/sse: %f\n"
- "diffenza c/s41: %f\n"
- "diffenza sse/avx: %f\n",
- tot_c, tot_avx, tot_sse, tot_s41,
- (float)tot_c/(float)tot_avx, (float)tot_c/(float)tot_sse, (float)tot_c/(float)tot_s41, (float)tot_sse/(float)tot_avx);
- tot_c = tot_avx = tot_sse = tot_s41 = 0;
- }
- system("pause"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement