Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* Compile the following code with:
- clang++ -O3 -mavx -m32 temp2.cpp -o myBinary
- The generated code will be efficient if the first option is used in the inner loop,
- but will be inefficient if the second option is used */
- #include "../iaca-mac64/include/iacaMarks.h"
- #include <immintrin.h>
- #include <vector>
- #define START_ANALYSIS IACA_START // Marker placed at the top of the inner loop body; IACA_START is presumably supplied by iacaMarks.h (not defined in this file)
- #define STOP_ANALYSIS IACA_END // Marker placed at the bottom of the inner loop body; IACA_END is presumably supplied by iacaMarks.h (not defined in this file)
- int nmax = 100; // Inclusive upper bound of the absM, n and l loops in TempCodeGenerationDemo
- class _jComplexPairAsVector256
- {
- // This class represents two complex numbers, storing them in a single 256-bit AVX vector
- protected:
- __m256d __ab;
- public:
- _jComplexPairAsVector256() { }
- __m256d ab(void) const { return __ab; }
- const __m256d *abPtr(void) const { return &__ab; }
- explicit _jComplexPairAsVector256(const double n) { __ab = _mm256_set_pd(n, n, n, n); }
- _jComplexPairAsVector256(__m256d inAB) { __ab = inAB; }
- _jComplexPairAsVector256& operator += (const _jComplexPairAsVector256 &n) { __ab = _mm256_add_pd(__ab, n.ab()); return *this; }
- _jComplexPairAsVector256 operator + (const _jComplexPairAsVector256 &n) const { return _jComplexPairAsVector256(*this) += n; }
- _jComplexPairAsVector256& operator -= (const _jComplexPairAsVector256 &n) { __ab = _mm256_sub_pd(__ab, n.ab()); return *this; }
- _jComplexPairAsVector256 operator - (const _jComplexPairAsVector256 &n) const { return _jComplexPairAsVector256(*this) -= n; }
- _jComplexPairAsVector256& operator += (const double &n) { __ab = _mm256_add_pd(__ab, (__m256d){n, 0, n, 0}); return *this; }
- _jComplexPairAsVector256 operator + (const double &n) const { return _jComplexPairAsVector256(*this) += n; }
- _jComplexPairAsVector256& operator -= (const double &n) { __ab = _mm256_sub_pd(__ab, (__m256d){n, 0, n, 0}); return *this; }
- _jComplexPairAsVector256 operator - (const double &n) const { return _jComplexPairAsVector256(*this) -= n; }
- _jComplexPairAsVector256& operator *= (double n) { __ab = _mm256_mul_pd(__ab, (__m256d){n, n, n, n}); return *this; }
- _jComplexPairAsVector256 operator * (double n) const { return _jComplexPairAsVector256(*this) *= n; }
- _jComplexPairAsVector256 Negated(double ar, double ai, double br, double bi) const { return _jComplexPairAsVector256(_mm256_mul_pd(__ab, (__m256d) { ar, ai, br, bi })); }
- _jComplexPairAsVector256 GetSwappedPairs(void) const { return _jComplexPairAsVector256(__builtin_shufflevector(__ab, __ab, 2, 3, 0, 1)); } // Return { b, a } given { a, b }
- _jComplexPairAsVector256 GetSwappedReIm(void) const { return _jComplexPairAsVector256(__builtin_shufflevector(__ab, __ab, 1, 0, 3, 2)); } // Return { im(a), re(a), im(b), re(b) } given { a, b }
- };
- inline _jComplexPairAsVector256 operator*(const double l, const _jComplexPairAsVector256 &r)
- {
- return r * l;
- }
- #include <stdio.h>
- void TempCodeGenerationDemo(const _jComplexPairAsVector256 *FAB_ji, _jComplexPairAsVector256 *input, _jComplexPairAsVector256 *result)
- {
- /* Ideally I would hide the vector stuff in the jComplexPair implementation
- However it is very desirable to promote the horizontal add out of the loop,
- since it is an expensive operation. As a result we have to expose some
- implementation details in this function.
- */
- size_t base_in_pos = 0, in_pos;
- size_t out_pos = 0, i1 = 0;
- // General +-m
- for (int absM = 1; absM <= nmax; absM++)
- {
- for (int n = absM; n <= nmax; n++)
- {
- in_pos = base_in_pos;
- _jComplexPairAsVector256 sum_ar = _jComplexPairAsVector256(0.0);
- _jComplexPairAsVector256 sum_ai = _jComplexPairAsVector256(0.0);
- _jComplexPairAsVector256 sum_arm = _jComplexPairAsVector256(0.0);
- _jComplexPairAsVector256 sum_aim = _jComplexPairAsVector256(0.0);
- for (int l = absM; l <= nmax; l++)
- {
- START_ANALYSIS
- // +m
- _jComplexPairAsVector256 factors_ab = input[in_pos];
- #if 0
- double Ar = ((double*)FAB_ji[i1].abPtr())[0];
- double Ai = ((double*)FAB_ji[i1].abPtr())[1];
- double Br = ((double*)FAB_ji[i1].abPtr())[2];
- double Bi = ((double*)FAB_ji[i1].abPtr())[3];
- #else
- double Ar = FAB_ji[i1].ab()[0];
- double Ai = FAB_ji[i1].ab()[1];
- double Br = FAB_ji[i1].ab()[2];
- double Bi = FAB_ji[i1].ab()[3];
- #endif
- /* Strategy: extract the four doubles associated with FAB and multiply them in turn with the ab values associated with +m
- Do the same for the ab values associated with -m (using the same AB values... we will wait until the inner loop is complete
- before we take care of the negations necessary to obtain the correct AB values for -m)
- Note that the pair swapping can be done for free using underused execution units, so I am happy to do that in the loop,
- which allows us to maintain only two running sums rather than four
- Note that doing the addition between intermediates and then adding once to sum_ar is better than adding twice to sum_ar
- since it improves throughput due to reduced dependencies */
- sum_ar += (Ar * factors_ab + Br * factors_ab.GetSwappedPairs());
- sum_ai += (Ai * factors_ab + Bi * factors_ab.GetSwappedPairs());
- // -m
- factors_ab = input[in_pos+1];
- sum_arm += (Ar * factors_ab - Br * factors_ab.GetSwappedPairs());
- sum_aim += (Ai * factors_ab - Bi * factors_ab.GetSwappedPairs());
- in_pos += 2;
- i1++;
- STOP_ANALYSIS
- }
- result[out_pos++] = (sum_ar + (sum_ai.GetSwappedReIm()).Negated(-1, 1, -1, 1));
- result[out_pos++] = ((sum_arm + (sum_aim.GetSwappedReIm()).Negated(-1, 1, -1, 1)));
- }
- base_in_pos = in_pos;
- }
- }
- void test(void)
- {
- _jComplexPairAsVector256 *a = new _jComplexPairAsVector256[10], *b = new _jComplexPairAsVector256[10], *c = new _jComplexPairAsVector256[10];
- TempCodeGenerationDemo(a, b, c);
- printf("%lf\n", c[40].ab()[0]);
- }
- int main(void)
- {
- test();
- }
Advertisement
Add Comment
Please, Sign In to add comment