Guest User

Untitled

a guest
May 20th, 2015
257
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.27 KB | None | 0 0
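  /* Compile the following code with:
         clang++ -O3 -mavx -m32 temp2.cpp -o myBinary
     The generated code is efficient when the first option in the inner loop of
     TempCodeGenerationDemo is used (the `#if 0` branch, which reads the four doubles
     through the pointer returned by abPtr()), but inefficient when the second option
     is used (the `#else` branch, which subscripts the vector returned by ab()). */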
  5.  
  6.  
  7.  
  #include "../iaca-mac64/include/iacaMarks.h"
  #include <immintrin.h>
  #include <cstddef>
  #include <vector>
  10. #define START_ANALYSIS IACA_START
  11. #define STOP_ANALYSIS IACA_END
  12.  
  13. int nmax = 100;
  14.  
  15. class _jComplexPairAsVector256
  16. {
  17. // This class represents two complex numbers, storing them in a single 256-bit AVX vector
  18. protected:
  19. __m256d __ab;
  20.  
  21. public:
  22.  
  23. _jComplexPairAsVector256() { }
  24.  
  25. __m256d ab(void) const { return __ab; }
  26. const __m256d *abPtr(void) const { return &__ab; }
  27.  
  28. explicit _jComplexPairAsVector256(const double n) { __ab = _mm256_set_pd(n, n, n, n); }
  29. _jComplexPairAsVector256(__m256d inAB) { __ab = inAB; }
  30.  
  31. _jComplexPairAsVector256& operator += (const _jComplexPairAsVector256 &n) { __ab = _mm256_add_pd(__ab, n.ab()); return *this; }
  32. _jComplexPairAsVector256 operator + (const _jComplexPairAsVector256 &n) const { return _jComplexPairAsVector256(*this) += n; }
  33. _jComplexPairAsVector256& operator -= (const _jComplexPairAsVector256 &n) { __ab = _mm256_sub_pd(__ab, n.ab()); return *this; }
  34. _jComplexPairAsVector256 operator - (const _jComplexPairAsVector256 &n) const { return _jComplexPairAsVector256(*this) -= n; }
  35.  
  36. _jComplexPairAsVector256& operator += (const double &n) { __ab = _mm256_add_pd(__ab, (__m256d){n, 0, n, 0}); return *this; }
  37. _jComplexPairAsVector256 operator + (const double &n) const { return _jComplexPairAsVector256(*this) += n; }
  38. _jComplexPairAsVector256& operator -= (const double &n) { __ab = _mm256_sub_pd(__ab, (__m256d){n, 0, n, 0}); return *this; }
  39. _jComplexPairAsVector256 operator - (const double &n) const { return _jComplexPairAsVector256(*this) -= n; }
  40. _jComplexPairAsVector256& operator *= (double n) { __ab = _mm256_mul_pd(__ab, (__m256d){n, n, n, n}); return *this; }
  41. _jComplexPairAsVector256 operator * (double n) const { return _jComplexPairAsVector256(*this) *= n; }
  42.  
  43. _jComplexPairAsVector256 Negated(double ar, double ai, double br, double bi) const { return _jComplexPairAsVector256(_mm256_mul_pd(__ab, (__m256d) { ar, ai, br, bi })); }
  44. _jComplexPairAsVector256 GetSwappedPairs(void) const { return _jComplexPairAsVector256(__builtin_shufflevector(__ab, __ab, 2, 3, 0, 1)); } // Return { b, a } given { a, b }
  45. _jComplexPairAsVector256 GetSwappedReIm(void) const { return _jComplexPairAsVector256(__builtin_shufflevector(__ab, __ab, 1, 0, 3, 2)); } // Return { im(a), re(a), im(b), re(b) } given { a, b }
  46.  
  47. };
  48. inline _jComplexPairAsVector256 operator*(const double l, const _jComplexPairAsVector256 &r)
  49. {
  50. return r * l;
  51. }
  52.  
  53. #include <stdio.h>
  54. void TempCodeGenerationDemo(const _jComplexPairAsVector256 *FAB_ji, _jComplexPairAsVector256 *input, _jComplexPairAsVector256 *result)
  55. {
  56. /* Ideally I would hide the vector stuff in the jComplexPair implementation
  57. However it is very desirable to promote the horizontal add out of the loop,
  58. since it is an expensive operation. As a result we have to expose some
  59. implementation details in this function.
  60. */
  61. size_t base_in_pos = 0, in_pos;
  62. size_t out_pos = 0, i1 = 0;
  63.  
  64. // General +-m
  65. for (int absM = 1; absM <= nmax; absM++)
  66. {
  67. for (int n = absM; n <= nmax; n++)
  68. {
  69. in_pos = base_in_pos;
  70.  
  71. _jComplexPairAsVector256 sum_ar = _jComplexPairAsVector256(0.0);
  72. _jComplexPairAsVector256 sum_ai = _jComplexPairAsVector256(0.0);
  73. _jComplexPairAsVector256 sum_arm = _jComplexPairAsVector256(0.0);
  74. _jComplexPairAsVector256 sum_aim = _jComplexPairAsVector256(0.0);
  75. for (int l = absM; l <= nmax; l++)
  76. {
  77. START_ANALYSIS
  78.  
  79. // +m
  80. _jComplexPairAsVector256 factors_ab = input[in_pos];
  81. #if 0
  82. double Ar = ((double*)FAB_ji[i1].abPtr())[0];
  83. double Ai = ((double*)FAB_ji[i1].abPtr())[1];
  84. double Br = ((double*)FAB_ji[i1].abPtr())[2];
  85. double Bi = ((double*)FAB_ji[i1].abPtr())[3];
  86. #else
  87. double Ar = FAB_ji[i1].ab()[0];
  88. double Ai = FAB_ji[i1].ab()[1];
  89. double Br = FAB_ji[i1].ab()[2];
  90. double Bi = FAB_ji[i1].ab()[3];
  91. #endif
  92. /* Strategy: extract the four doubles associated with FAB and multiply them in turn with the ab values associated with +m
  93. Do the same for the ab values associated with -m (using the same AB values... we will wait until the inner loop is complete
  94. before we take care of the negations necessary to obtain the correct AB values for -m)
  95. Note that the pair swapping can be done for free using underused execution units, so I am happy to do that in the loop,
  96. which allows us to maintain only two running sums rather than four
  97.  
  98. Note that doing the addition between intermediates and then adding once to sum_ar is better than adding twice to sum_ar
  99. since it improves throughput due to reduced dependencies */
  100. sum_ar += (Ar * factors_ab + Br * factors_ab.GetSwappedPairs());
  101. sum_ai += (Ai * factors_ab + Bi * factors_ab.GetSwappedPairs());
  102.  
  103. // -m
  104. factors_ab = input[in_pos+1];
  105. sum_arm += (Ar * factors_ab - Br * factors_ab.GetSwappedPairs());
  106. sum_aim += (Ai * factors_ab - Bi * factors_ab.GetSwappedPairs());
  107. in_pos += 2;
  108. i1++;
  109. STOP_ANALYSIS
  110. }
  111. result[out_pos++] = (sum_ar + (sum_ai.GetSwappedReIm()).Negated(-1, 1, -1, 1));
  112. result[out_pos++] = ((sum_arm + (sum_aim.GetSwappedReIm()).Negated(-1, 1, -1, 1)));
  113. }
  114. base_in_pos = in_pos;
  115. }
  116. }
  117.  
  118. void test(void)
  119. {
  120. _jComplexPairAsVector256 *a = new _jComplexPairAsVector256[10], *b = new _jComplexPairAsVector256[10], *c = new _jComplexPairAsVector256[10];
  121.  
  122. TempCodeGenerationDemo(a, b, c);
  123. printf("%lf\n", c[40].ab()[0]);
  124. }
  125.  
  126. int main(void)
  127. {
  128. test();
  129. }
Advertisement
Add Comment
Please, Sign In to add comment