Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- long long int sum_simd_unrolled(unsigned int vals[NUM_ELEMS]) {
- clock_t start = clock();
- __m128i _127 = _mm_set1_epi32(127);
- long long int result = 0;
- for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
- /* COPY AND PASTE YOUR sum_simd() HERE */
- /* MODIFY IT BY UNROLLING IT */
- __m128i curr_sum = _mm_setzero_si128();
- for (unsigned int i = 0; i < NUM_ELEMS / 16 * 16; i += 16) {
- __m128i values = _mm_loadu_si128((__m128i*)(vals + i));
- __m128i mask = _mm_cmpgt_epi32(values, _127);
- values = _mm_and_si128(values, mask);
- curr_sum = _mm_add_epi32(curr_sum, values);
- values = _mm_loadu_si128((__m128i*)(vals + i + 4));
- mask = _mm_cmpgt_epi32(values, _127);
- values = _mm_and_si128(values, mask);
- curr_sum = _mm_add_epi32(curr_sum, values);
- values = _mm_loadu_si128((__m128i*)(vals + i + 8));
- mask = _mm_cmpgt_epi32(values, _127);
- values = _mm_and_si128(values, mask);
- curr_sum = _mm_add_epi32(curr_sum, values);
- values = _mm_loadu_si128((__m128i*)(vals + i + 12));
- mask = _mm_cmpgt_epi32(values, _127);
- values = _mm_and_si128(values, mask);
- curr_sum = _mm_add_epi32(curr_sum, values);
- }
- int sums[4];
- _mm_storeu_si128((__m128i*) sums, curr_sum);
- for (int i = 0; i < 4; ++i) {
- result += sums[i];
- }
- /* You'll need a tail case. */
- for(unsigned int i = NUM_ELEMS / 16 * 16; i < NUM_ELEMS; i++) {
- if (vals[i] >= 128) {
- result += vals[i];
- }
- }
- /* You'll need 1 or maybe 2 tail cases here. */
- }
- clock_t end = clock();
- printf("Time taken: %Lf s\n", (long double)(end - start) / CLOCKS_PER_SEC);
- return result;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement