Advertisement
zhangsongcui

Sum all bytes of a ymm register

Jan 6th, 2019
181
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 1.09 KB | None | 0 0
  1. #include <immintrin.h>
  2.  
  /*
   * Horizontal sum of the 32 bytes held in `start`, each byte read as an
   * unsigned value (0..255). The result therefore lies in 0..8160.
   *
   * Strategy: widen bytes to 16-bit pair sums, collapse each 128-bit lane
   * with horizontal adds, then combine the two lane totals in scalar code.
   */
  int add(__m256i start) {
      const __m256i zeros = _mm256_setzero_si256();

      /* unsigned byte * 1, adjacent pairs added: 32 bytes -> 16 words. */
      __m256i acc = _mm256_maddubs_epi16(start, _mm256_set1_epi8(1));

      /*
       * hadd works inside each 128-bit lane. Feeding zeros as the second
       * operand halves the number of live words per lane on every round:
       * 8 -> 4 -> 2 -> 1, leaving each lane's total in its lowest word.
       */
      for (int round = 0; round < 3; ++round) {
          acc = _mm256_hadd_epi16(acc, zeros);
      }

      /* Dword 0 is the low lane's total, dword 4 the high lane's total. */
      const int low_lane = _mm256_extract_epi32(acc, 0);
      const int high_lane = _mm256_extract_epi32(acc, 4);
      return low_lane + high_lane;
  }
  12.  
  /*
   * Horizontal sum of the 32 bytes held in `start`, each byte read as an
   * unsigned value (0..255). Same reduction as a per-lane hadd chain, but
   * the two lane totals are combined inside the vector register so only a
   * single scalar extract is needed at the end.
   */
  int add2(__m256i start) {
      const __m256i zeros = _mm256_setzero_si256();

      /* unsigned byte * 1, adjacent pairs added: 32 bytes -> 16 words. */
      __m256i lanes = _mm256_maddubs_epi16(start, _mm256_set1_epi8(1));

      /* Three in-lane horizontal adds leave each lane's total in its low word. */
      for (int round = 0; round < 3; ++round) {
          lanes = _mm256_hadd_epi16(lanes, zeros);
      }

      /*
       * The shuffle selector places source qword 2 (low qword of the high
       * lane) into destination qword 0; the other destination qwords are
       * not consumed below.
       */
      const __m256i high_in_low =
          _mm256_permute4x64_epi64(lanes, _MM_SHUFFLE(0, 0, 0, 2));

      /* Qword 0 now holds low-lane total + high-lane total. */
      const __m256i total = _mm256_add_epi64(lanes, high_in_low);
      return _mm256_extract_epi32(total, 0);
  }
  24.  
  /*
   * Scalar reference: spill `start` to memory and add up its 32 bytes, each
   * byte read as an unsigned value (0..255). The result lies in 0..8160.
   *
   * The buffer is `unsigned char` on purpose. Plain `char` has
   * implementation-defined signedness; where it is signed, bytes >= 0x80
   * would be added as negative numbers and the result would no longer match
   * the unsigned sum that the maddubs-based variants compute.
   */
  int add3(__m256i start) {
      unsigned char buf[32];

      /* Unaligned store: a local byte array has no 32-byte alignment guarantee. */
      _mm256_storeu_si256((__m256i *)buf, start);

      int res = 0;
      for (int i = 0; i < 32; ++i) {
          res += buf[i];
      }
      return res;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement