// Untitled

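/*	Compile the following code with:
		clang++ -O3 -mavx -m32 temp2.cpp -o myBinary
	The generated code is efficient when the first option (the `#if` branch, which reads
	the four doubles through abPtr()) is used in the inner loop of TempCodeGenerationDemo,
	but inefficient when the second option (the `#else` branch, which indexes the vector
	returned by ab()) is used.	*/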


#include "../iaca-mac64/include/iacaMarks.h"
#include <immintrin.h>
// Markers placed at the start and end of the inner-loop body in
// TempCodeGenerationDemo. They expand to IACA_START / IACA_END, which are
// presumably supplied by iacaMarks.h (included above) -- TODO confirm.
#define START_ANALYSIS IACA_START
#define STOP_ANALYSIS IACA_END

// Inclusive upper bound of the absM, n and l loops in TempCodeGenerationDemo.
int nmax = 100;

class _jComplexPairAsVector256
{
    // Two complex numbers {a, b} packed into one 256-bit AVX vector, with the
    // lanes ordered { re(a), im(a), re(b), im(b) }.
protected:
    __m256d __ab;

public:

    // Leaves the vector uninitialised.
    _jComplexPairAsVector256() { }

    // Broadcasts n into all four lanes.
    explicit _jComplexPairAsVector256(const double n) : __ab(_mm256_set1_pd(n)) { }

    // Wraps an existing vector (implicit conversion is intentional).
    _jComplexPairAsVector256(__m256d inAB) : __ab(inAB) { }

    // Raw vector, by value and by address.
    __m256d ab() const { return __ab; }
    const __m256d *abPtr() const { return &__ab; }

    // Lane-wise addition / subtraction of another pair.
    _jComplexPairAsVector256& operator += (const _jComplexPairAsVector256 &n)
    {
        __ab = _mm256_add_pd(__ab, n.ab());
        return *this;
    }
    _jComplexPairAsVector256 operator + (const _jComplexPairAsVector256 &n) const
    {
        _jComplexPairAsVector256 sum(*this);
        sum += n;
        return sum;
    }
    _jComplexPairAsVector256& operator -= (const _jComplexPairAsVector256 &n)
    {
        __ab = _mm256_sub_pd(__ab, n.ab());
        return *this;
    }
    _jComplexPairAsVector256 operator - (const _jComplexPairAsVector256 &n) const
    {
        _jComplexPairAsVector256 difference(*this);
        difference -= n;
        return difference;
    }

    // Adding / subtracting a real scalar only touches the real lanes (0 and 2);
    // zero is added to / subtracted from the imaginary lanes.
    _jComplexPairAsVector256& operator += (const double &n)
    {
        __ab = _mm256_add_pd(__ab, _mm256_setr_pd(n, 0, n, 0));
        return *this;
    }
    _jComplexPairAsVector256 operator + (const double &n) const
    {
        _jComplexPairAsVector256 sum(*this);
        sum += n;
        return sum;
    }
    _jComplexPairAsVector256& operator -= (const double &n)
    {
        __ab = _mm256_sub_pd(__ab, _mm256_setr_pd(n, 0, n, 0));
        return *this;
    }
    _jComplexPairAsVector256 operator - (const double &n) const
    {
        _jComplexPairAsVector256 difference(*this);
        difference -= n;
        return difference;
    }

    // Scaling by a real scalar multiplies all four lanes.
    _jComplexPairAsVector256& operator *= (double n)
    {
        __ab = _mm256_mul_pd(__ab, _mm256_set1_pd(n));
        return *this;
    }
    _jComplexPairAsVector256 operator * (double n) const
    {
        _jComplexPairAsVector256 scaled(*this);
        scaled *= n;
        return scaled;
    }

    // Multiplies the lanes by { ar, ai, br, bi } respectively; passing -1 for a
    // lane negates it and passing 1 leaves it unchanged.
    _jComplexPairAsVector256 Negated(double ar, double ai, double br, double bi) const
    {
        const __m256d laneFactors = _mm256_setr_pd(ar, ai, br, bi);
        return _jComplexPairAsVector256(_mm256_mul_pd(__ab, laneFactors));
    }

    // Return { b, a } given { a, b }
    _jComplexPairAsVector256 GetSwappedPairs() const
    {
        const __m256d swapped = __builtin_shufflevector(__ab, __ab, 2, 3, 0, 1);
        return _jComplexPairAsVector256(swapped);
    }

    // Return { im(a), re(a), im(b), re(b) } given { a, b }
    _jComplexPairAsVector256 GetSwappedReIm() const
    {
        const __m256d swapped = __builtin_shufflevector(__ab, __ab, 1, 0, 3, 2);
        return _jComplexPairAsVector256(swapped);
    }

};
// Scalar-on-the-left multiplication: returns a copy of r with every lane scaled by l.
inline _jComplexPairAsVector256 operator*(const double l, const _jComplexPairAsVector256 &r)
{
    _jComplexPairAsVector256 scaled(r);
    scaled *= l;
    return scaled;
}

#include <stdio.h>
/*  Accumulates products of the FAB_ji coefficients with the input pairs and
    writes two combined sums (one for +m, one for -m) per (absM, n) iteration.

    With N = nmax, the loops below access:
      FAB_ji : N(N+1)(2N+1)/6 elements, read sequentially (i1 is never reset).
      input  : N(N+1) elements; each absM reads a window of 2*(N-absM+1)
               elements, which is re-read for every n, alternating +m / -m.
      result : N(N+1) elements, written sequentially, two per (absM, n).
    The caller must supply buffers at least that large.
 */
void TempCodeGenerationDemo(const _jComplexPairAsVector256 *FAB_ji, _jComplexPairAsVector256 *input, _jComplexPairAsVector256 *result)
{
    /*  Ideally the vector details would be hidden inside the jComplexPair
     implementation. However, it is very desirable to hoist the horizontal add
     out of the loop, since it is an expensive operation. As a result some
     implementation details have to be exposed in this function.
     */
    size_t base_in_pos = 0, in_pos;
    size_t out_pos = 0, i1 = 0;

    // General +-m
    for (int absM = 1; absM <= nmax; absM++)
    {
        for (int n = absM; n <= nmax; n++)
        {
            // Every n re-reads the same window of input for the current absM.
            in_pos = base_in_pos;

            // Running sums: *_ar / *_ai for +m, *_arm / *_aim for -m.
            _jComplexPairAsVector256	sum_ar = _jComplexPairAsVector256(0.0);
            _jComplexPairAsVector256	sum_ai = _jComplexPairAsVector256(0.0);
            _jComplexPairAsVector256	sum_arm = _jComplexPairAsVector256(0.0);
            _jComplexPairAsVector256	sum_aim = _jComplexPairAsVector256(0.0);
            for (int l = absM; l <= nmax; l++)
            {
                START_ANALYSIS

                // +m
                _jComplexPairAsVector256	factors_ab = input[in_pos];
                // The two branches below extract the same four lanes of FAB_ji[i1];
                // they differ only in how the lanes are accessed.
#if 0
                // First option: read the lanes through a pointer to the stored vector.
                double Ar = ((double*)FAB_ji[i1].abPtr())[0];
                double Ai = ((double*)FAB_ji[i1].abPtr())[1];
                double Br = ((double*)FAB_ji[i1].abPtr())[2];
                double Bi = ((double*)FAB_ji[i1].abPtr())[3];
#else
                // Second option: index the vector returned by value from ab().
                double Ar = FAB_ji[i1].ab()[0];
                double Ai = FAB_ji[i1].ab()[1];
                double Br = FAB_ji[i1].ab()[2];
                double Bi = FAB_ji[i1].ab()[3];
#endif
                /*  Strategy: extract the four doubles associated with FAB and multiply them in turn with the ab values associated with +m.
                 Do the same for the ab values associated with -m, using the same AB values; the negations needed to obtain
                 the correct AB values for -m are deferred until the inner loop is complete.
                 Note that the pair swapping can be done for free using underused execution units, so it is fine to do it in the loop,
                 which allows us to maintain only two running sums (per sign of m) rather than four.

                 Note that adding the two intermediates together and then adding once to sum_ar is better than adding twice to sum_ar,
                 since the reduced dependencies improve throughput.    */
                sum_ar += (Ar * factors_ab + Br * factors_ab.GetSwappedPairs());
                sum_ai += (Ai * factors_ab + Bi * factors_ab.GetSwappedPairs());

                // -m
                factors_ab = input[in_pos+1];
                sum_arm += (Ar * factors_ab - Br * factors_ab.GetSwappedPairs());
                sum_aim += (Ai * factors_ab - Bi * factors_ab.GetSwappedPairs());
                in_pos += 2;
                i1++;
                STOP_ANALYSIS
            }
            // Combine the sums: swapping re/im of the *_ai sum and scaling the lanes by
            // { -1, 1, -1, 1 } turns { x0, x1, x2, x3 } into { -x1, x0, -x3, x2 },
            // which is then added lane-wise to the *_ar sum.
            result[out_pos++] = (sum_ar + (sum_ai.GetSwappedReIm()).Negated(-1, 1, -1, 1));
            result[out_pos++] = ((sum_arm + (sum_aim.GetSwappedReIm()).Negated(-1, 1, -1, 1)));
        }
        // The next absM starts reading input where this one stopped.
        base_in_pos = in_pos;
    }
}

void test(void)
{
    _jComplexPairAsVector256 *a = new _jComplexPairAsVector256[10], *b = new _jComplexPairAsVector256[10], *c = new _jComplexPairAsVector256[10];

    TempCodeGenerationDemo(a, b, c);
    printf("%lf\n", c[40].ab()[0]);
}

// Program entry point: runs the demo driver once and reports success.
int main(void)
{
    test();
    return 0;
}