Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #ifdef YES_AVX
/*
 * One 4-column step of the AVX kernel in matvec_YMM: loads x[off..off+3] and
 * the matching columns of four consecutive matrix rows, multiplies them and
 * adds the products to the four per-row accumulators. Kept as a macro so the
 * four steps per inner iteration stay explicitly unrolled.
 */
#define MATVEC_YMM_STEP(off)                                   \
    do {                                                       \
        rx0 = _mm256_load_pd(ptr_x + (off));                   \
        ra0 = _mm256_load_pd(ptr_a + (off));                   \
        ra1 = _mm256_load_pd(ptr_a + n + (off));               \
        ra2 = _mm256_load_pd(ptr_a + 2 * n + (off));           \
        ra3 = _mm256_load_pd(ptr_a + 3 * n + (off));           \
        ra0 = _mm256_mul_pd(ra0, rx0);                         \
        ra1 = _mm256_mul_pd(ra1, rx0);                         \
        ra2 = _mm256_mul_pd(ra2, rx0);                         \
        ra3 = _mm256_mul_pd(ra3, rx0);                         \
        ry0 = _mm256_add_pd(ry0, ra0);                         \
        ry1 = _mm256_add_pd(ry1, ra1);                         \
        ry2 = _mm256_add_pd(ry2, ra2);                         \
        ry3 = _mm256_add_pd(ry3, ra3);                         \
    } while (0)

/*
 * matvec_YMM - dense matrix-vector product y = A * x using AVX intrinsics.
 *
 *   a   row-major n x n matrix (row i starts at a + i * n)
 *   x   input vector of n elements
 *   y   output vector of n elements; every element is overwritten
 *   n   matrix dimension
 *   lb  unused; kept so the signature stays compatible with existing callers
 *
 * Fast path (n is a positive multiple of 16): four rows are processed per
 * outer iteration and 16 columns per inner iteration using aligned 256-bit
 * loads, so a and x must be 32-byte aligned on this path.
 *
 * Any other positive n is handled by a plain scalar loop, because the
 * vector kernel would otherwise read and write past the end of the arrays.
 * n <= 0 is a no-op.
 */
void matvec_YMM(double* a, double* x, double* y, int n, int lb)
{
    int i, j;
    __m256d rx0, ra0, ra1, ra2, ra3, ry0, ry1, ry2, ry3;
    double *ptr_x, *ptr_a;
    /* Scratch space for the horizontal sums. Written with unaligned stores,
       so no compiler-specific alignment attribute is needed. */
    double buf0[4], buf1[4], buf2[4], buf3[4];

    (void)lb;

    if (n <= 0)
    {
        return;
    }

    if (n % 16 != 0)
    {
        /* Scalar fallback: the kernel below only supports multiples of 16. */
        ptr_a = a;
        for (i = 0; i < n; i++)
        {
            double sum = 0.0;
            for (j = 0; j < n; j++)
            {
                sum += ptr_a[j] * x[j];
            }
            y[i] = sum;
            ptr_a += n;
        }
        return;
    }

    ptr_a = a;
    for (i = 0; i < n; i += 4)
    {
        ry0 = ry1 = ry2 = ry3 = _mm256_setzero_pd();
        ptr_x = x;
        for (j = 0; j < n; j += 16)
        {
            /* Request the data of the next inner iteration: 16 doubles
               ahead in x and in each of the four rows being processed. */
            _mm_prefetch((const char *)(ptr_x + 16), _MM_HINT_T0);
            _mm_prefetch((const char *)(ptr_x + 24), _MM_HINT_T0);
            _mm_prefetch((const char *)(ptr_a + 16), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a + 24), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a + n + 16), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a + n + 24), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a + 2 * n + 16), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a + 2 * n + 24), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a + 3 * n + 16), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a + 3 * n + 24), _MM_HINT_NTA);

            MATVEC_YMM_STEP(0);
            MATVEC_YMM_STEP(4);
            MATVEC_YMM_STEP(8);
            MATVEC_YMM_STEP(12);

            ptr_a += 16;
            ptr_x += 16;
        }
        /* The inner loop advanced ptr_a by one full row; skip the other
           three rows that were handled in the same pass. */
        ptr_a += 3 * n;

        /* Horizontal sum of each accumulator. _mm256_store_pd would need a
           32-byte aligned destination, so the unaligned store is used. */
        _mm256_storeu_pd(buf0, ry0);
        _mm256_storeu_pd(buf1, ry1);
        _mm256_storeu_pd(buf2, ry2);
        _mm256_storeu_pd(buf3, ry3);
        y[i] = buf0[0] + buf0[1] + buf0[2] + buf0[3];
        y[i + 1] = buf1[0] + buf1[1] + buf1[2] + buf1[3];
        y[i + 2] = buf2[0] + buf2[1] + buf2[2] + buf2[3];
        y[i + 3] = buf3[0] + buf3[1] + buf3[2] + buf3[3];
    }
}

#undef MATVEC_YMM_STEP
- #endif
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement