Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // 256-bit version: two VPDPBUSD steps accumulated into a single __m256i (operands elided with "..." in this snippet).
- __m256i k0123 = ...; // passed as the 2nd argument of the first dpbusd call below; presumably packed 8-bit coefficients 0..3 -- TODO confirm
- __m256i k4567 = ...; // passed as the 2nd argument of the second dpbusd call below; presumably packed 8-bit coefficients 4..7 -- TODO confirm
- __m256i acc32 = _mm256_setzero_si256(); // accumulator starts at all zeros
- // 2x VPDPBUSD (one per _mm256_dpbusd_epi32 call below)
- // Port breakdown: 2p01 -- presumably 2 uops in total, each able to issue on port 0 or port 1 (author's figure; verify for the target CPU)
- // Best case execution: 1p0 1p1 (one uop on port 0, one on port 1)
- acc32 = _mm256_dpbusd_epi32(acc32, k0123, _mm256_load_si256(...)); // NOTE(review): the intrinsic treats its 2nd argument as unsigned bytes and its 3rd as signed bytes -- confirm this matches the data
- acc32 = _mm256_dpbusd_epi32(acc32, k4567, _mm256_load_si256(...)); // takes the previous acc32 as input, so this call depends on the first one's result
- _mm256_store_si256(..., acc32); // write the accumulated result out (destination elided)
- // 512-bit version: the same two-step VPDPBUSD accumulation using __m512i (operands elided with "..." in this snippet).
- __m512i k0123 = ...; // passed as the 2nd argument of the first dpbusd call below; presumably packed 8-bit coefficients 0..3 -- TODO confirm
- __m512i k4567 = ...; // passed as the 2nd argument of the second dpbusd call below; presumably packed 8-bit coefficients 4..7 -- TODO confirm
- __m512i acc32 = _mm512_setzero_si512(); // accumulator starts at all zeros
- // 2x VPDPBUSD (one per _mm512_dpbusd_epi32 call below)
- // Port breakdown ("1FMA"/"2FMA" presumably mean CPUs with one or two 512-bit FMA units -- confirm)
- // 1FMA: 2p0 -- both uops restricted to port 0
- // 2FMA: 2p05 -- each uop may issue on port 0 or port 5
- // Best execution (author's figures; verify for the target CPU)
- // 1FMA: 2p0 -- both uops on port 0
- // 2FMA: 1p0 1p5 -- one uop on port 0, one on port 5
- acc32 = _mm512_dpbusd_epi32(acc32, k0123, _mm512_load_si512(...)); // NOTE(review): the intrinsic treats its 2nd argument as unsigned bytes and its 3rd as signed bytes -- confirm this matches the data
- acc32 = _mm512_dpbusd_epi32(acc32, k4567, _mm512_load_si512(...)); // takes the previous acc32 as input, so this call depends on the first one's result
- _mm512_store_si512(..., acc32); // write the accumulated result out (destination elided)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement