SHARE
TWEET

Untitled

a guest Sep 11th, 2019 82 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  // AVX2 sketch: four byte-wise multiply-add steps (VPMADDUBSW) whose 16-bit
  // results are widened and summed into two 32-bit accumulators, which are
  // then written out with two 256-bit stores.  Operands shown as `...` are
  // elided in this fragment.
  // k01..k67 are the first operands of the four VPMADDUBSW steps; presumably
  // each packs a pair of 8-bit filter coefficients (taps 0-1, 2-3, 4-5, 6-7)
  // -- TODO confirm, their initialisers are elided.
  1. __m256i k01 = ...;
  2. __m256i k23 = ...;
  3. __m256i k45 = ...;
  4. __m256i k67 = ...;
  // Vector of 16-bit ones: multiplying by it in VPMADDWD turns that
  // instruction into a pure "add adjacent 16-bit pairs into 32 bits".
  5. __m256i one = _mm256_set1_epi16(1);
  6.  
  // acc16a..d receive the 16-bit VPMADDUBSW results; acc32lo/acc32hi hold the
  // 32-bit sums.
  7. __m256i acc16a, acc16b, acc16c, acc16d, acc32lo, acc32hi;
  8.  
  // The accumulators start at zero and are only added to below.
  // NOTE(review): in this fragment they are accumulated into exactly once;
  // presumably the original wraps the following steps in a loop -- confirm.
  9. acc32lo = _mm256_setzero_si256();
  10. acc32hi = _mm256_setzero_si256();
  11.  
  12. // Instruction mix below: 4x VPMADDUBSW, 4x VPMADDWD, 4x VPUNPCK[LH]WD, 4x VPADDD.
  13. // Execution ports, in that order: 4p01, 4p01, 4p5, 4p015 --> totals 8p01 4p5 4p015
  14. // Best case execution: 6p0 6p1 4p5 (i.e. all four p015 VPADDD uops issue on p0/p1; target microarchitecture is not stated here).
  // _mm256_maddubs_epi16(a, b) treats the bytes of `a` as unsigned and the
  // bytes of `b` as signed, multiplies them element-wise and adds each
  // adjacent pair of products into a signed 16-bit lane with saturation.
  // NOTE(review): with this operand order the k* vectors are the unsigned
  // side and the loaded data the signed side -- confirm that is intended.
  // _mm256_load_si256 requires a 32-byte-aligned address.
  15. acc16a = _mm256_maddubs_epi16(k01, _mm256_load_si256(...));
  16. acc16b = _mm256_maddubs_epi16(k23, _mm256_load_si256(...));
  17. acc16c = _mm256_maddubs_epi16(k45, _mm256_load_si256(...));
  18. acc16d = _mm256_maddubs_epi16(k67, _mm256_load_si256(...));
  19.  
  // Widen-and-add: unpack{lo,hi}_epi16(x, y) interleaves the 16-bit elements
  // of x and y, so every adjacent pair is (x[i], y[i]); VPMADDWD against
  // `one` then yields x[i] + y[i] as a signed 32-bit value, so these sums are
  // formed at 32-bit width rather than in 16-bit lanes.
  // The unpacks operate within each 128-bit lane, so acc32lo collects 16-bit
  // element positions 0-3 and 8-11, and acc32hi positions 4-7 and 12-15.
  20. acc32lo = _mm256_add_epi32(acc32lo, _mm256_madd_epi16(_mm256_unpacklo_epi16(acc16a, acc16b), one));
  21. acc32hi = _mm256_add_epi32(acc32hi, _mm256_madd_epi16(_mm256_unpackhi_epi16(acc16a, acc16b), one));
  22. acc32lo = _mm256_add_epi32(acc32lo, _mm256_madd_epi16(_mm256_unpacklo_epi16(acc16c, acc16d), one));
  23. acc32hi = _mm256_add_epi32(acc32hi, _mm256_madd_epi16(_mm256_unpackhi_epi16(acc16c, acc16d), one));
  24.  
  // Each store writes a 256-bit vector assembled from 128-bit lanes of the
  // two accumulators; the lane selectors and destinations are elided.
  // Presumably the selectors are 0x20 and 0x31, which would put the per-lane
  // results back into element order 0-7 and 8-15 -- TODO confirm.
  // _mm256_store_si256 requires a 32-byte-aligned destination.
  25. _mm256_store_si256(..., _mm256_permute2x128_si256(acc32lo, acc32hi, ...));
  26. _mm256_store_si256(..., _mm256_permute2x128_si256(acc32lo, acc32hi, ...));
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top