Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- CPU: Intel Xeon E3-1240v3 @3.40GHz
- Test code:
#include <emmintrin.h>
#include <immintrin.h>
#include <avx2intrin.h>
#include <intrin.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <vector>
- #ifndef min
- #define min(a,b) (((a) < (b)) ? (a) : (b))
- #endif
- #include "../iaca-win64/iacaMarks.h"
- #define avx 1
- #define IACA_MARKS_OFF
- typedef uint32_t PixelType2;
- typedef uint16_t PixelType;
- __inline __m256i _mm256_loadu2_m128i(__m128i *low, __m128i *high)
- {
- return _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)low)),
- _mm_loadu_si128((__m128i*)high),1);
- }
- void ToPixels_orig(uint8_t *pDst8, int nDstPitch, const uint8_t *pSrc8, int nSrcPitch, int nWidth, int nHeight, int bitsPerSample)
- {
- int pixelMax = (1 << bitsPerSample) - 1;
- for (int h=0; h<nHeight; h++)
- {
- const PixelType2 *pSrc = (const PixelType2 *)pSrc8;
- PixelType *pDst = (PixelType *)pDst8;
- for (int i=0; i<nWidth; i++)
- {
- int a = (pSrc[i] + 16)>>5;
- if (sizeof(PixelType) == 1)
- pDst[i] = a | ((255-a) >> (sizeof(int)*8-1));
- else
- pDst[i] = min(pixelMax, a);
- }
- pDst8 += nDstPitch;
- pSrc8 += nSrcPitch;
- }
- }
// AVX2 conversion of 32-bit samples to 16-bit pixels. For every sample v the
// stored value is min((v + 16) >> 5, pixelMax), where the addition wraps
// modulo 2^32 and pixelMax = 2^bitsPerSample - 1. This matches the scalar
// loop at the end of each row, which handles the leftover samples.
//   pDst8, nDstPitch : destination base pointer and row stride in bytes
//   pSrc8, nSrcPitch : source base pointer and row stride in bytes
//   nWidth, nHeight  : samples per row and number of rows
//   bitsPerSample    : output bit depth; values of 16 or more are treated
//                      as 16 because the output samples are 16 bits wide
// No alignment is required: all vector accesses are unaligned loads/stores.
// The function-level target attribute lets this compile without -mavx2 on
// GCC/Clang; the caller must still only invoke it on CPUs that support AVX2.
#if defined(__GNUC__)
__attribute__((target("avx2")))
#endif
void ToPixels_AVX2_16bit(uint8_t *pDst8, int nDstPitch, const uint8_t *pSrc8, int nSrcPitch, int nWidth, int nHeight, int bitsPerSample)
{
    const uint32_t pixelMax = (bitsPerSample >= 16) ? 0xFFFFu : ((1u << bitsPerSample) - 1u);
    const __m256i sixteen = _mm256_set1_epi32(16);
    const __m256i limit = _mm256_set1_epi32(static_cast<int>(pixelMax));
    // Number of samples per row that fill whole 16-sample vector iterations.
    const int nWidth_16 = nWidth & ~15;

    for (int h = 0; h < nHeight; h++)
    {
        const uint32_t *pSrc = reinterpret_cast<const uint32_t *>(pSrc8);
        uint16_t *pDst = reinterpret_cast<uint16_t *>(pDst8);

        for (int i = 0; i < nWidth_16; i += 16)
        {
            // _mm256_packus_epi32 packs inside each 128-bit lane, so the four
            // quarters are interleaved on load:
            //   src_1 = { s[i..i+3]   | s[i+8..i+11]  }
            //   src_2 = { s[i+4..i+7] | s[i+12..i+15] }
            // which makes the packed result come out in memory order.
            // The two-load combine is written out here rather than through a
            // helper named like the vendor intrinsic _mm256_loadu2_m128i,
            // whose documented argument order is (high, low).
            const __m128i q0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pSrc + i));
            const __m128i q1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pSrc + i + 4));
            const __m128i q2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pSrc + i + 8));
            const __m128i q3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pSrc + i + 12));
            const __m256i src_1 = _mm256_inserti128_si256(_mm256_castsi128_si256(q0), q2, 1);
            const __m256i src_2 = _mm256_inserti128_si256(_mm256_castsi128_si256(q1), q3, 1);

            // Logical shift keeps the value non-negative, so the signed-input
            // pack below never sees a negative lane.
            const __m256i a1 = _mm256_min_epu32(_mm256_srli_epi32(_mm256_add_epi32(src_1, sixteen), 5), limit);
            const __m256i a2 = _mm256_min_epu32(_mm256_srli_epi32(_mm256_add_epi32(src_2, sixteen), 5), limit);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + i), _mm256_packus_epi32(a1, a2));
        }

        // Scalar tail for the last (nWidth % 16) samples of the row.
        for (int i = nWidth_16; i < nWidth; i++)
        {
            const uint32_t a = (pSrc[i] + 16u) >> 5;
            pDst[i] = static_cast<uint16_t>(a < pixelMax ? a : pixelMax);
        }

        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
    }
}
// 128-bit SIMD conversion of 32-bit samples to 16-bit pixels. For every
// sample v the stored value is min((v + 16) >> 5, pixelMax), where the
// addition wraps modulo 2^32 and pixelMax = 2^bitsPerSample - 1. This matches
// the scalar loop at the end of each row, which handles leftover samples.
//   pDst8, nDstPitch : destination base pointer and row stride in bytes
//   pSrc8, nSrcPitch : source base pointer and row stride in bytes
//   nWidth, nHeight  : samples per row and number of rows
//   bitsPerSample    : output bit depth; values of 16 or more are treated
//                      as 16 because the output samples are 16 bits wide
// Despite the name, the body needs SSE4.1 (_mm_packus_epi32, _mm_min_epu32)
// and SSE3 (_mm_lddqu_si128). The function-level target attribute lets it
// compile without -msse4.1 on GCC/Clang; the caller must still only invoke
// it on CPUs that support SSE4.1.
#if defined(__GNUC__)
__attribute__((target("sse4.1")))
#endif
void ToPixels_SSE2_16bit(uint8_t *pDst8, int nDstPitch, const uint8_t *pSrc8, int nSrcPitch, int nWidth, int nHeight, int bitsPerSample)
{
    const uint32_t pixelMax = (bitsPerSample >= 16) ? 0xFFFFu : ((1u << bitsPerSample) - 1u);
    const __m128i sixteen = _mm_set1_epi32(16);
    const __m128i limit = _mm_set1_epi32(static_cast<int>(pixelMax));
    // Number of samples per row that fill whole 8-sample vector iterations.
    const int nWidth_8 = nWidth & ~7;

    for (int h = 0; h < nHeight; h++)
    {
        const uint32_t *pSrc = reinterpret_cast<const uint32_t *>(pSrc8);
        uint16_t *pDst = reinterpret_cast<uint16_t *>(pDst8);

        for (int i = 0; i < nWidth_8; i += 8)
        {
            const __m128i src_1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(pSrc + i));
            const __m128i src_2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(pSrc + i + 4));
            // Logical shift keeps the value non-negative, so the signed-input
            // pack below never sees a negative lane.
            const __m128i a1 = _mm_min_epu32(_mm_srli_epi32(_mm_add_epi32(src_1, sixteen), 5), limit);
            const __m128i a2 = _mm_min_epu32(_mm_srli_epi32(_mm_add_epi32(src_2, sixteen), 5), limit);
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + i), _mm_packus_epi32(a1, a2));
        }

        // Scalar tail for the last (nWidth % 8) samples of the row.
        for (int i = nWidth_8; i < nWidth; i++)
        {
            const uint32_t a = (pSrc[i] + 16u) >> 5;
            pDst[i] = static_cast<uint16_t>(a < pixelMax ? a : pixelMax);
        }

        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
    }
}
// Benchmark sizing constants.
// MAX   : 2^32; not referenced by the code visible in this file.
// BATCH : 2^16; used by main() as the buffer length, as the per-call width
//         and as the number of calls per pass.
// ITER  : number of timed passes main() performs.
constexpr uint64_t MAX = UINT64_C(1) << 32;
constexpr uint32_t BATCH = UINT32_C(1) << 16;
constexpr uint8_t ITER = 10;
using namespace std;
- int main()
- {
- uint32_t *src = new uint32_t[BATCH];
- uint16_t *dst = new uint16_t[BATCH];
- uint16_t *sse = new uint16_t[BATCH];
- uint64_t tot_c = 0, tot_avx = 0, tot_sse = 0;
- uint64_t start, end;
- for(uint8_t j = 0; j < ITER; j++)
- {
- for(uint32_t iter = 0; iter < BATCH; iter++)
- {
- for(uint64_t i = 0; i < BATCH; i++)
- src[i] = BATCH * iter + i;
- start = __rdtsc();
- ToPixels_orig((uint8_t*)dst, BATCH, (uint8_t*)src, BATCH, BATCH, 1, 16);
- end = __rdtsc();
- tot_c += end - start;
- start = __rdtsc();
- ToPixels_SSE2_16bit((uint8_t*)sse, BATCH, (uint8_t*)src, BATCH, BATCH, 1, 16);
- end = __rdtsc();
- tot_sse += end - start;
- start = __rdtsc();
- ToPixels_AVX2_16bit((uint8_t*)sse, BATCH, (uint8_t*)src, BATCH, BATCH, 1, 16);
- end = __rdtsc();
- tot_avx += end - start;
- }
- printf("cicli c : %20llu\n"
- "cicli avx: %20llu\n"
- "cicli sse: %20llu\n"
- "diffenza c/avx: %f\n"
- "diffenza c/sse: %f\n"
- "diffenza sse/avx: %f\n", tot_c, tot_avx, tot_sse, (float)tot_c/(float)tot_avx, (float)tot_c/(float)tot_sse, (float)tot_sse/(float)tot_avx);
- }
- system("pause");
- return 0;
- }
- Results with -O2 -march=core-avx2 (cumulative cycle counts, printed after each pass):
- cicli c : 16069028936
- cicli avx: 3479067731
- cicli sse: 3952420122
- diffenza c/avx: 4.618774
- diffenza c/sse: 4.065618
- diffenza sse/avx: 1.136057
- cicli c : 36326105778
- cicli avx: 7875570050
- cicli sse: 8945499109
- diffenza c/avx: 4.612505
- diffenza c/sse: 4.060825
- diffenza sse/avx: 1.135854
- cicli c : 53213062764
- cicli avx: 11534067363
- cicli sse: 13100670525
- diffenza c/avx: 4.613555
- diffenza c/sse: 4.061858
- diffenza sse/avx: 1.135824
- cicli c : 67941010573
- cicli avx: 14727013850
- cicli sse: 16725254304
- diffenza c/avx: 4.613360
- diffenza c/sse: 4.062181
- diffenza sse/avx: 1.135685
- cicli c : 89191667408
- cicli avx: 19335612709
- cicli sse: 21957883495
- diffenza c/avx: 4.612818
- diffenza c/sse: 4.061943
- diffenza sse/avx: 1.135619
- cicli c : 109229600089
- cicli avx: 23575137811
- cicli sse: 26813487597
- diffenza c/avx: 4.633254
- diffenza c/sse: 4.073681
- diffenza sse/avx: 1.137363
- cicli c : 122392466376
- cicli avx: 26355151378
- cicli sse: 29993017323
- diffenza c/avx: 4.643968
- diffenza c/sse: 4.080699
- diffenza sse/avx: 1.138032
- cicli c : 139711839407
- cicli avx: 30032840475
- cicli sse: 34210571303
- diffenza c/avx: 4.651969
- diffenza c/sse: 4.083879
- diffenza sse/avx: 1.139105
- cicli c : 154549375786
- cicli avx: 33205606096
- cicli sse: 37834240435
- diffenza c/avx: 4.654315
- diffenza c/sse: 4.084908
- diffenza sse/avx: 1.139393
- cicli c : 167509109185
- cicli avx: 36013172486
- cicli sse: 41023592505
- diffenza c/avx: 4.651329
- diffenza c/sse: 4.083239
- diffenza sse/avx: 1.139127
- Results with -O3 -march=core-avx2 (cumulative cycle counts, printed after each pass):
- cicli c : 6871242081
- cicli avx: 5664816544
- cicli sse: 6254182296
- diffenza c/avx: 1.212968
- diffenza c/sse: 1.098664
- diffenza sse/avx: 1.104040
- cicli c : 12683725691
- cicli avx: 10469204071
- cicli sse: 11560038037
- diffenza c/avx: 1.211527
- diffenza c/sse: 1.097204
- diffenza sse/avx: 1.104195
- cicli c : 17847899606
- cicli avx: 14773516038
- cicli sse: 16374322201
- diffenza c/avx: 1.208101
- diffenza c/sse: 1.089993
- diffenza sse/avx: 1.108356
- cicli c : 23062613551
- cicli avx: 19120545569
- cicli sse: 21234215418
- diffenza c/avx: 1.206169
- diffenza c/sse: 1.086106
- diffenza sse/avx: 1.110544
- cicli c : 27206401088
- cicli avx: 22573441790
- cicli sse: 25082508148
- diffenza c/avx: 1.205239
- diffenza c/sse: 1.084676
- diffenza sse/avx: 1.111151
- cicli c : 31937676651
- cicli avx: 26493333877
- cicli sse: 29447055320
- diffenza c/avx: 1.205499
- diffenza c/sse: 1.084580
- diffenza sse/avx: 1.111489
- cicli c : 37115015173
- cicli avx: 30810575815
- cicli sse: 34294225401
- diffenza c/avx: 1.204619
- diffenza c/sse: 1.082253
- diffenza sse/avx: 1.113067
- cicli c : 41459165763
- cicli avx: 34391557986
- cicli sse: 38270611514
- diffenza c/avx: 1.205504
- diffenza c/sse: 1.083316
- diffenza sse/avx: 1.112791
- cicli c : 47518820008
- cicli avx: 39417068102
- cicli sse: 43835101728
- diffenza c/avx: 1.205539
- diffenza c/sse: 1.084036
- diffenza sse/avx: 1.112084
- cicli c : 53266037373
- cicli avx: 44166766473
- cicli sse: 49077206995
- diffenza c/avx: 1.206021
- diffenza c/sse: 1.085352
- diffenza sse/avx: 1.111180
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement