Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- __m256i k01 = ...; // initializer elided in this snippet; first operand of the 1st maddubs below (presumably kernel taps 0/1 -- TODO confirm)
- __m256i k23 = ...; // initializer elided; first operand of the 2nd maddubs below
- __m256i k45 = ...; // initializer elided; first operand of the 3rd maddubs below
- __m256i k67 = ...; // initializer elided; first operand of the 4th maddubs below
- __m256i one = _mm256_set1_epi16(1); // every 16-bit lane = 1: madd against this is a pairwise add that widens 16 -> 32 bits
- __m256i acc16a, acc16b, acc16c, acc16d, acc32lo, acc32hi; // acc16*: 16-bit maddubs results; acc32*: 32-bit running sums
- acc32lo = _mm256_setzero_si256(); // start the sum fed by the unpacklo halves at zero
- acc32hi = _mm256_setzero_si256(); // start the sum fed by the unpackhi halves at zero
- // Instruction mix of the step below: 4x VPMADDUBSW, 4x VPMADDWD, 4x VPUNPCK[LH]WD, 4x VPADDD
- // Author's execution-port estimate per group: 4p01, 4p01, 4p5, 4p015 --> total 8p01 4p5 4p015
- // Best case execution: 6p0 6p1 4p5 (the 4 p015 uops landing on p0/p1) -- microarchitecture not stated, verify for the target CPU
- acc16a = _mm256_maddubs_epi16(k01, _mm256_load_si256(...)); // u8*s8 products, adjacent pairs summed with signed saturation to 16 bits; NOTE(review): the FIRST operand (k01) is the one read as unsigned -- confirm intended
- acc16b = _mm256_maddubs_epi16(k23, _mm256_load_si256(...)); // same with k23; the aligned load requires a 32-byte-aligned address (address elided)
- acc16c = _mm256_maddubs_epi16(k45, _mm256_load_si256(...)); // same with k45
- acc16d = _mm256_maddubs_epi16(k67, _mm256_load_si256(...)); // same with k67
- acc32lo = _mm256_add_epi32(acc32lo, _mm256_madd_epi16(_mm256_unpacklo_epi16(acc16a, acc16b), one)); // interleave a/b low words per 128-bit lane, then madd-by-1 gives a[i]+b[i] as 32-bit (no 16-bit overflow on this add)
- acc32hi = _mm256_add_epi32(acc32hi, _mm256_madd_epi16(_mm256_unpackhi_epi16(acc16a, acc16b), one)); // same for the high words of each 128-bit lane
- acc32lo = _mm256_add_epi32(acc32lo, _mm256_madd_epi16(_mm256_unpacklo_epi16(acc16c, acc16d), one)); // add c[i]+d[i] (low words) into the same 32-bit sums
- acc32hi = _mm256_add_epi32(acc32hi, _mm256_madd_epi16(_mm256_unpackhi_epi16(acc16c, acc16d), one)); // add c[i]+d[i] (high words) into the same 32-bit sums
- _mm256_store_si256(..., _mm256_permute2x128_si256(acc32lo, acc32hi, ...)); // combine 128-bit halves of lo/hi and store (32-byte-aligned); control and destination elided -- presumably undoes the per-lane unpack ordering, TODO confirm
- _mm256_store_si256(..., _mm256_permute2x128_si256(acc32lo, acc32hi, ...)); // second store of the remaining halves; control and destination elided
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement