Advertisement
Guest User

Untitled

a guest
Mar 19th, 2019
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.02 KB | None | 0 0
  1. static word neg_word(warp all_ones, warp one, word value)
  2. {
  3. #ifdef __AVX2__
  4. const __m256i lane64_one = _mm256_broadcastq_epi64(_mm256_castsi256_si128(one));
  5. const __m256i maxu64 = _mm256_sub_epi64(_mm256_setzero_si256(), one);
  6. //
  7. const __m256i notv = _mm256_andnot_si256(value, all_ones);
  8. const __m256i mask0 = _mm256_cmpeq_epi64(all_ones, notv);
  9. const __m256i temp = _mm256_permute4x64_epi64(mask0, _MM_SHUFFLE(2, 1, 0, 0));
  10. const __m256i mask1 = _mm256_blendv_epi8(temp, maxu64, maxu64);
  11. const __m256i mask2 = _mm256_permute4x64_epi64(mask1, _MM_SHUFFLE(2, 1, 0, 0));
  12. const __m256i mask3 = _mm256_permute4x64_epi64(mask1, _MM_SHUFFLE(1, 0, 0, 0));
  13. const __m256i mask = _mm256_and_si256(_mm256_and_si256(mask1, mask2), mask3);
  14. const __m256i carry = _mm256_and_si256(mask, lane64_one);
  15. const __m256i result = _mm256_add_epi64(notv, carry);
  16. return result;
  17. #elif __SSSE3__
  18. const __m128i maxu64 = _mm_sub_epi64(_mm_setzero_si128(), one);
  19. const __m128i lane64_one = _mm_shuffle_epi32(one, _MM_SHUFFLE(1, 0, 1, 0));
  20. //
  21. const __m128i notvlo = _mm_andnot_si128(value.lo, all_ones);
  22. const __m128i notvhi = _mm_andnot_si128(value.hi, all_ones);
  23. const __m128i mask0lo = mm_cmpeq_epi64(all_ones, notvlo);
  24. const __m128i mask0hi = mm_cmpeq_epi64(all_ones, notvhi);
  25. const __m128i mask1lo = _mm_or_si128(_mm_bslli_si128(mask0lo, 8), maxu64);
  26. const __m128i mask1hi = _mm_unpackhi_epi64(mask0lo, _mm_bslli_si128(mask0hi, 8));
  27. const __m128i mask2lo = all_ones;
  28. const __m128i mask2hi = mask0lo;
  29. const __m128i mask3lo = all_ones;
  30. const __m128i mask3hi = _mm_or_si128(_mm_bslli_si128(mask0lo, 8), maxu64);
  31. const __m128i masklo = _mm_and_si128(_mm_and_si128(mask1lo, mask2lo), mask3lo);
  32. const __m128i maskhi = _mm_and_si128(_mm_and_si128(mask1hi, mask2hi), mask3hi);
  33. const __m128i carrylo = _mm_and_si128(masklo, lane64_one);
  34. const __m128i carryhi = _mm_and_si128(maskhi, lane64_one);
  35. const __m128i lo = _mm_add_epi32(notvlo, carrylo);
  36. const __m128i hi = _mm_add_epi32(notvhi, carryhi);
  37. return (word){ lo, hi };
  38. #endif
  39. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement