Advertisement
Guest User

Untitled

a guest
Nov 22nd, 2019
418
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.88 KB | None | 0 0
  1. long long int sum_simd_unrolled(unsigned int vals[NUM_ELEMS]) {
  2. clock_t start = clock();
  3. __m128i _127 = _mm_set1_epi32(127);
  4. long long int result = 0;
  5. for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
  6. /* COPY AND PASTE YOUR sum_simd() HERE */
  7. /* MODIFY IT BY UNROLLING IT */
  8. __m128i curr_sum = _mm_setzero_si128();
  9. for (unsigned int i = 0; i < NUM_ELEMS / 16 * 16; i += 16) {
  10. __m128i values = _mm_loadu_si128((__m128i*)(vals + i));
  11. __m128i mask = _mm_cmpgt_epi32(values, _127);
  12. values = _mm_and_si128(values, mask);
  13. curr_sum = _mm_add_epi32(curr_sum, values);
  14.  
  15. values = _mm_loadu_si128((__m128i*)(vals + i + 4));
  16. mask = _mm_cmpgt_epi32(values, _127);
  17. values = _mm_and_si128(values, mask);
  18. curr_sum = _mm_add_epi32(curr_sum, values);
  19.  
  20. values = _mm_loadu_si128((__m128i*)(vals + i + 8));
  21. mask = _mm_cmpgt_epi32(values, _127);
  22. values = _mm_and_si128(values, mask);
  23. curr_sum = _mm_add_epi32(curr_sum, values);
  24.  
  25. values = _mm_loadu_si128((__m128i*)(vals + i + 12));
  26. mask = _mm_cmpgt_epi32(values, _127);
  27. values = _mm_and_si128(values, mask);
  28. curr_sum = _mm_add_epi32(curr_sum, values);
  29. }
  30. int sums[4];
  31. _mm_storeu_si128((__m128i*) sums, curr_sum);
  32. for (int i = 0; i < 4; ++i) {
  33. result += sums[i];
  34. }
  35. /* You'll need a tail case. */
  36. for(unsigned int i = NUM_ELEMS / 16 * 16; i < NUM_ELEMS; i++) {
  37. if (vals[i] >= 128) {
  38. result += vals[i];
  39. }
  40. }
  41. /* You'll need 1 or maybe 2 tail cases here. */
  42. }
  43. clock_t end = clock();
  44. printf("Time taken: %Lf s\n", (long double)(end - start) / CLOCKS_PER_SEC);
  45. return result;
  46. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement