Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <tmmintrin.h>
#include <x86intrin.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <memory>
#include <new>
#include <random>

#include "iacaMarks.h"
- using namespace std;
// Number of floats in every benchmark buffer; also the trip count of each
// timed loop in main().
constexpr int ITER = 100000000;

// Smaller iteration count. Not referenced by any active code in this file.
constexpr int ITER2 = 10000000;

// Build switches.
//   SSE / AVX : compile the corresponding abs helper and benchmark section.
//   TEST      : when defined, the IACA_START / IACA_END markers are left as
//               they are instead of being redefined to nothing.
//   TEST_AVX  : not referenced anywhere in the visible source.
#define SSE
#define AVX
#define TEST
#define TEST_AVX
- #ifdef SSE
/// Lane-wise absolute value of four packed single-precision floats.
///
/// Clears bit 31 (the IEEE-754 sign bit) of every lane and leaves the other
/// 31 bits untouched, so -0.0f becomes +0.0f and NaN payloads are preserved.
///
/// @param x four packed floats
/// @return the same four lanes with their sign bits cleared
///
/// NOTE(review): identifiers starting with an underscore are reserved at
/// global scope; the name is kept because callers in this file use it.
inline __m128 _mm_abs_ps(__m128 x)
{
    // A plain local constant instead of a function-local static: the static
    // had a non-constant initializer, which can cost a guarded-initialisation
    // check on every call inside the hot loop. 0x7FFFFFFF is the same bit
    // pattern the original produced with ~0x80000000.
    const __m128 clear_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    return _mm_and_ps(clear_sign_mask, x);
}
- #endif // SSE
- #ifdef AVX
- inline __m256 _mm256_abs_ps(__m256 x)
- {
- static __m256 const Mask = _mm256_castsi256_ps(_mm256_set1_epi32(~0x80000000));
- __m256 abs = _mm256_and_ps(Mask, x);
- return abs;
- }
- #endif // AVX
/// Overlay of a float and an unsigned 32-bit integer sharing one storage
/// location. In the visible source it is only referenced by the commented-out
/// variant inside ffabs().
union FI
{
    float f;    ///< the storage viewed as a float
    uint32_t i; ///< the storage viewed as a 32-bit unsigned integer
};
/// Scalar absolute value computed by clearing the sign bit of the float's
/// 32-bit representation.
///
/// The bits are moved between float and integer with std::memcpy. The
/// previous implementation dereferenced reinterpreted pointers
/// (`*(int*)&a`, `*(float*)&i`), which violates the strict-aliasing rule and
/// is undefined behaviour; memcpy is the well-defined equivalent and
/// compilers reduce it to a plain register move.
///
/// @param a input value
/// @return `a` with bit 31 cleared: -0.0f yields +0.0f, infinities become
///         +infinity, and a NaN stays a NaN with its payload unchanged
inline float ffabs(float a)
{
    static_assert(sizeof(float) == sizeof(std::uint32_t),
                  "ffabs assumes a 32-bit float");

    std::uint32_t bits;
    std::memcpy(&bits, &a, sizeof bits);

    // Keep exponent and mantissa, drop the sign bit.
    bits &= 0x7FFFFFFFu;

    float magnitude;
    std::memcpy(&magnitude, &bits, sizeof magnitude);
    return magnitude;
}
// When TEST is not defined, redefine the IACA markers to nothing so that any
// use of them compiles away. (IACA_START / IACA_END are presumably supplied
// by "iacaMarks.h" -- confirm against that header.)
#ifndef TEST
#undef IACA_START
#define IACA_START
#undef IACA_END
#define IACA_END
#endif // TEST

// Per-kernel aliases for the markers.
#define IACA_START_AVX IACA_START
#define IACA_END_AVX IACA_END
#define IACA_START_SSE IACA_START
#define IACA_END_SSE IACA_END

// The start aliases were originally spelled "SART". The misspelled names are
// kept so that existing uses keep working.
#define IACA_SART_AVX IACA_START_AVX
#define IACA_SART_SSE IACA_START_SSE
- int main()
- {
- /*
- mt19937 randGen(42);
- float *srcp = new float[ITER];
- float *thrp = new float[ITER];
- uniform_real_distribution<float> distribution(-123453,123453);
- for(int j = 0; j < 20; j++)
- {
- int retainedCoefs = 0;
- int retainedCoefsAbs = 0;
- cout<<"batch: "<<j<<endl;
- for (int i = 0; i < ITER; i++)
- {
- srcp[i] = distribution(randGen);
- thrp[i] = distribution(randGen);
- }
- for (int i = 0; i < ITER; i++)
- {
- if (srcp[i] > thrp[i] || srcp[i] < -thrp[i])
- {
- ++retainedCoefs;
- }
- }
- for (int i = 0; i < ITER; i++)
- {
- if (fabs(srcp[i]) > thrp[i])
- {
- ++retainedCoefsAbs;
- }
- }
- cout<<retainedCoefs<<endl<<retainedCoefsAbs<<endl;
- if(retainedCoefs != retainedCoefsAbs)
- {
- system("pause");
- }
- }
- */
- /*
- __m128i cmp_sum = _mm_set_epi32(1,2,3,4);
- int retainedCoefs = 0;
- volatile int iter = 10000000;
- uint64_t start = __rdtsc();
- for(int i = 0; i < iter; ++i)
- {
- alignas(16) int32_t cmp_sum_i32[4];
- _mm_store_si128(reinterpret_cast<__m128i *>(cmp_sum_i32), cmp_sum);
- retainedCoefs += cmp_sum_i32[0] + cmp_sum_i32[1] + cmp_sum_i32[2] + cmp_sum_i32[3];
- }
- uint64_t end = __rdtsc();
- cout<<retainedCoefs<<endl;
- cout<<end-start<<endl;
- retainedCoefs = 0;
- start = __rdtsc();
- for(int i = 0; i < iter; ++i)
- {
- __m128i cmp_sum_t = _mm_hadd_epi32(cmp_sum, cmp_sum);
- cmp_sum_t = _mm_hadd_epi32(cmp_sum_t, cmp_sum_t);
- retainedCoefs += _mm_extract_epi32(cmp_sum_t, 0);
- }
- end = __rdtsc();
- cout<<retainedCoefs<<endl;
- cout<<end-start<<endl;
- */
- /*
- mt19937 randGen(42);
- float refp;
- uniform_real_distribution<float> distribution(0,1);
- for(int j = 0; j < 20; j++)
- {
- float wienerCoef = 0;
- float wienerCoef_opt = 0;
- const float sigmaSquare = distribution(randGen);
- const float rcp_sigmaSquare = 1/sigmaSquare;
- cout<<"batch: "<<j<<endl;
- //for (int i = 0; i < ITER; i++)
- {
- //srcp[i] = distribution(randGen);
- refp = distribution(randGen);
- }
- //for (int i = 0; i < ITER; i++)
- {
- const float refSquare = refp * refp;
- wienerCoef = refSquare / (refSquare + sigmaSquare);
- }
- //for (int i = 0; i < ITER; i++)
- {
- const float refSquare = refp * refp;
- wienerCoef_opt = 1/(1 + (refSquare * rcp_sigmaSquare));
- }
- cout<<wienerCoef<<endl<<wienerCoef_opt<<endl;
- }
- */
- mt19937 randGen(42);
- //float *srcp = new float[ITER];
- float *srcpC= new float[ITER];
- float *srcpS= new float[ITER];
- float *srcpA= new float[ITER];
- float *thrp = new float[ITER];
- int retainedCoefsC = 0, retainedCoefsS = 0, retainedCoefsA = 0;
- uint64_t CC, CS, CA;
- uint64_t start, end;
- uniform_real_distribution<float> distribution(-1.0f, 1.0f);
- for(int j = 0; j < 20; j++)
- {
- cout<<"batch: "<<j<<endl;
- CC = 0; CS = 0; CA = 0;
- for (int i = 0; i < ITER; i++)
- {
- srcpC[i] = distribution(randGen);
- srcpS[i] = srcpC[i];
- srcpA[i] = srcpC[i];
- thrp[i] = distribution(randGen);
- }
- #ifdef SSE
- start = __rdtsc();
- //IACA_SART_SSE;
- //SSE2
- {
- static const ptrdiff_t simd_step = 4;
- __m128i cmp_sum = _mm_setzero_si128();
- __m128 zero = _mm_setzero_ps();
- for (int i = 0; i < ITER; i += simd_step)
- {
- const __m128 s1 = _mm_load_ps(&srcpS[i]);
- const __m128 t1p = _mm_load_ps(&thrp[i]);
- const __m128 abs = _mm_abs_ps(s1);
- const __m128 cmp = _mm_cmpgt_ps(abs, t1p);
- const __m128 d1 = _mm_and_ps(cmp, s1);
- _mm_store_ps(&srcpS[i], d1);
- cmp_sum = _mm_sub_epi32(cmp_sum, _mm_castps_si128(cmp));
- }
- alignas(16) int32_t cmp_sum_i32[4];
- _mm_store_si128(reinterpret_cast<__m128i *>(cmp_sum_i32), cmp_sum);
- retainedCoefsS += cmp_sum_i32[0] + cmp_sum_i32[1] + cmp_sum_i32[2] + cmp_sum_i32[3];
- }
- //IACA_END_SSE;
- end = __rdtsc();
- CS = end - start;
- cout<<retainedCoefsS<<endl;
- #endif //SSE
- #ifdef AVX
- start = __rdtsc();
- //AVX
- {
- static const ptrdiff_t simd_step = 8;
- __m128i cmp_sum = _mm_setzero_si128();
- __m128i cmp_0, cmp_1;
- //__m256i cmp_sum256 = _mm256_setzero_si256();
- for (int i = 0; i < ITER; i += simd_step)
- {
- //IACA_SART_AVX;
- __m256 s1 = _mm256_load_ps(&srcpA[i]);
- __m256 t1 = _mm256_load_ps(&thrp[i]);
- __m256 abs = _mm256_abs_ps(s1);
- __m256 cmp = _mm256_cmp_ps(abs, t1, _CMP_GT_OQ);
- __m256 d1 = _mm256_and_ps(cmp, s1);
- _mm256_storeu_ps(&srcpA[i], d1);
- //cmp_sum256 = _mm256_sub_epi32(cmp_sum256, _mm256_castps_si256(cmp));
- cmp_0 = _mm256_extractf128_si256(_mm256_castps_si256(cmp), 0);
- cmp_1 = _mm256_extractf128_si256(_mm256_castps_si256(cmp), 1);
- _mm256_zeroupper();
- cmp_sum = _mm_sub_epi32(cmp_sum, cmp_0);
- cmp_sum = _mm_sub_epi32(cmp_sum, cmp_1);
- }
- /*
- alignas(32) int32_t cmp_sum_i32[8];
- _mm256_store_si256(reinterpret_cast<__m256i *>(cmp_sum_i32), cmp_sum256);
- retainedCoefsA += cmp_sum_i32[0] + cmp_sum_i32[1] + cmp_sum_i32[2] + cmp_sum_i32[3]+\
- cmp_sum_i32[4] + cmp_sum_i32[5] + cmp_sum_i32[6] + cmp_sum_i32[7];
- */
- //IACA_END_AVX;
- alignas(16) int32_t cmp_sum_i32[4];
- _mm_store_si128(reinterpret_cast<__m128i *>(cmp_sum_i32), cmp_sum);
- retainedCoefsA += cmp_sum_i32[0] + cmp_sum_i32[1] + cmp_sum_i32[2] + cmp_sum_i32[3];
- }
- end = __rdtsc();
- CA = end - start;
- cout<<retainedCoefsA<<endl;
- #endif
- start = __rdtsc();
- //C
- {
- for (int i = 0; i < ITER; ++i)
- {
- //if (srcpC[i] > thrp[i] || srcpC[i] < -thrp[i])
- if(ffabs(srcpC[i]) > thrp[i])
- {
- ++retainedCoefsC;
- }
- else
- {
- srcpC[i] = 0;
- }
- }
- }
- end = __rdtsc();
- CC = end - start;
- cout<<retainedCoefsC<<endl;
- printf("Cicli C : %f\n"
- "Cicli SSE: %f\n"
- "Cicli AVX: %f\n"
- "Guadagno SSE/AVX: %f\n",
- (float)CC/(float)ITER, CS/(float)ITER*4, CA/(float)ITER*8, (float)CS/(float)CA
- );
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement