Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * Fragment: matrix-vector product y = A * x on doubles using SSE2 intrinsics.
 *
 * `a` is read as n rows of n doubles stored back to back (row r starts at
 * a + r * n). Rows are processed four at a time and columns eight at a time;
 * each __m128d holds two doubles, so one pass of the inner loop issues four
 * two-wide multiply/add steps per row.
 *
 * The enclosing function and the declarations of a, x, y and n are not part
 * of this fragment.
 *
 * NOTE(review): there is no remainder handling. The inner loop always reads
 * 8 columns and the outer loop always writes 4 results, so this appears to
 * require n to be a multiple of 8 - confirm with the caller.
 * NOTE(review): _mm_load_pd is the aligned load, so every address passed to
 * it (x + 2k, a + r * n + 2k) must be 16-byte aligned - confirm how a and x
 * are allocated.
 */
- int i, j; /* i: first row of the current 4-row group; j: first column of the current 8-column group */
- __m128d rx0, ra0, ra1, ra2, ra3, ry0, ry1, ry2, ry3; /* rx0: two x values; ra0..ra3: two elements from each of four rows; ry0..ry3: two-lane running sums, one per row */
- double* ptr_x, * ptr_a; /* read cursors into x and a */
- __declspec(align(16)) double tmp0[2], tmp1[2], tmp2[2], tmp3[2]; /* 16-byte-aligned scratch for _mm_store_pd; __declspec(align) is Microsoft-specific syntax */
- memset((void*)y, 0, n * sizeof(double)); /* NOTE(review): y[i]..y[i + 3] are overwritten (not accumulated) below, so this zeroing looks redundant when n is a multiple of 4 */
- ptr_a = a;
- for (i = 0; i < n; i += 4) { /* one pass per group of four rows */
- ry0 = ry1 = ry2 = ry3 = _mm_setzero_pd(); /* reset the four row accumulators */
- ptr_x = x; /* restart at the beginning of x for this row group */
- for (j = 0; j < n; j += 8, ptr_a += 8, ptr_x += 8) { /* one pass per group of eight columns */
- rx0 = _mm_load_pd(ptr_x); /* columns j, j + 1: load the x pair, then multiply-accumulate into each of the four rows */
- ra0 = _mm_load_pd(ptr_a);
- ra1 = _mm_load_pd(ptr_a + n); /* same columns, next row (rows are n doubles apart) */
- ra2 = _mm_load_pd(ptr_a + 2 * n);
- ra3 = _mm_load_pd(ptr_a + 3 * n);
- ra0 = _mm_mul_pd(ra0, rx0);
- ry0 = _mm_add_pd(ry0, ra0);
- ra1 = _mm_mul_pd(ra1, rx0);
- ry1 = _mm_add_pd(ry1, ra1);
- ra2 = _mm_mul_pd(ra2, rx0);
- ry2 = _mm_add_pd(ry2, ra2);
- ra3 = _mm_mul_pd(ra3, rx0);
- ry3 = _mm_add_pd(ry3, ra3);
- rx0 = _mm_load_pd(ptr_x + 2); /* columns j + 2, j + 3: same steps */
- ra0 = _mm_load_pd(ptr_a + 2);
- ra1 = _mm_load_pd(ptr_a + n + 2);
- ra2 = _mm_load_pd(ptr_a + 2 * n + 2);
- ra3 = _mm_load_pd(ptr_a + 3 * n + 2);
- ra0 = _mm_mul_pd(ra0, rx0);
- ry0 = _mm_add_pd(ry0, ra0);
- ra1 = _mm_mul_pd(ra1, rx0);
- ry1 = _mm_add_pd(ry1, ra1);
- ra2 = _mm_mul_pd(ra2, rx0);
- ry2 = _mm_add_pd(ry2, ra2);
- ra3 = _mm_mul_pd(ra3, rx0);
- ry3 = _mm_add_pd(ry3, ra3);
- rx0 = _mm_load_pd(ptr_x + 4); /* columns j + 4, j + 5: same steps */
- ra0 = _mm_load_pd(ptr_a + 4);
- ra1 = _mm_load_pd(ptr_a + n + 4);
- ra2 = _mm_load_pd(ptr_a + 2 * n + 4);
- ra3 = _mm_load_pd(ptr_a + 3 * n + 4);
- ra0 = _mm_mul_pd(ra0, rx0);
- ry0 = _mm_add_pd(ry0, ra0);
- ra1 = _mm_mul_pd(ra1, rx0);
- ry1 = _mm_add_pd(ry1, ra1);
- ra2 = _mm_mul_pd(ra2, rx0);
- ry2 = _mm_add_pd(ry2, ra2);
- ra3 = _mm_mul_pd(ra3, rx0);
- ry3 = _mm_add_pd(ry3, ra3);
- rx0 = _mm_load_pd(ptr_x + 6); /* columns j + 6, j + 7: same steps */
- ra0 = _mm_load_pd(ptr_a + 6);
- ra1 = _mm_load_pd(ptr_a + n + 6);
- ra2 = _mm_load_pd(ptr_a + 2 * n + 6);
- ra3 = _mm_load_pd(ptr_a + 3 * n + 6);
- ra0 = _mm_mul_pd(ra0, rx0);
- ry0 = _mm_add_pd(ry0, ra0);
- ra1 = _mm_mul_pd(ra1, rx0);
- ry1 = _mm_add_pd(ry1, ra1);
- ra2 = _mm_mul_pd(ra2, rx0);
- ry2 = _mm_add_pd(ry2, ra2);
- ra3 = _mm_mul_pd(ra3, rx0);
- ry3 = _mm_add_pd(ry3, ra3);
- }
- ptr_a += 3 * n; /* the inner loop moved ptr_a forward by one row (n doubles, given n is a multiple of 8); skip three more rows to reach the next 4-row group */
- _mm_store_pd(tmp0, ry0); /* spill each two-lane accumulator to memory so its lanes can be added as scalars */
- _mm_store_pd(tmp1, ry1);
- _mm_store_pd(tmp2, ry2);
- _mm_store_pd(tmp3, ry3);
- y[i] = tmp0[0] + tmp0[1]; /* horizontal sum of the two lanes gives the full dot product for row i */
- y[i + 1] = tmp1[0] + tmp1[1];
- y[i + 2] = tmp2[0] + tmp2[1];
- y[i + 3] = tmp3[0] + tmp3[1];
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement