Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ////////////////////////////////////////////////////////////////////////////////////////////////////
- // mvec.cpp
- #include "stdafx.h"
- #include <iostream>
- #include "mvec.h"
- #include <emmintrin.h>
- #include <immintrin.h>
- using namespace std;
// Reference scalar matrix-vector product: y = A * x.
// a: n-by-n row-major matrix, x: input vector (length n), y: output vector
// (length n, fully overwritten). Flat index ij walks A row by row so the
// inner loop is a single streaming pass over the matrix.
void mult_naive(double *a, double *x, double *y, int n)
{
    // NOTE: the original declared `double register reg` — the `register`
    // keyword was removed in C++17; a plain local is kept in a register
    // by any optimizing compiler anyway.
    for (int i = 0, ij = 0; i < n; ++i)
    {
        double acc = 0.0;
        for (int j = 0; j < n; ++j, ++ij)
        {
            acc += a[ij] * x[j];
        }
        y[i] = acc;
    }
}
// SSE2 matrix-vector product: y = A * x using 128-bit lanes (2 doubles).
// Requirements: n even; a and x 16-byte aligned (aligned loads are used).
// lb is unused — kept so all benchmark kernels share one signature.
void matvec_XMM(double* a, double* x, double* y, int n, int lb)
{
    __m128d rx0, ra0, ry0;
    // Aligned stack scratch for the horizontal sum. The original used
    // `new double[2]` which (a) was never deleted — a leak per call — and
    // (b) is not guaranteed to be 16-byte aligned for _mm_store_pd.
    alignas(16) double tmp[2];
    const double* ptr_a = a;
    for (int i = 0; i < n; ++i) {
        ry0 = _mm_setzero_pd();
        const double* ptr_x = x;
        // Two columns per step: multiply-accumulate into the vector sum.
        for (int j = 0; j < n; j += 2, ptr_a += 2, ptr_x += 2) {
            rx0 = _mm_load_pd(ptr_x);
            ra0 = _mm_load_pd(ptr_a);
            ry0 = _mm_add_pd(ry0, _mm_mul_pd(ra0, rx0));
        }
        // Horizontal sum of the two partial products. (The original also
        // memset y first, but every y[i] is written here, so that was dead.)
        _mm_store_pd(tmp, ry0);
        y[i] = tmp[0] + tmp[1];
    }
}
- #ifdef YES_AVX
// SSE2 matrix-vector product, blocked 4 rows at a time with the inner
// loop unrolled over 8 columns: y = A * x.
// Requirements: n a multiple of 8; a and x 16-byte aligned. lb is unused.
// NOTE(review): despite the name, this version uses 128-bit XMM registers,
// not 256-bit YMM — the name is kept for existing callers/benchmarks.
void matvec_YMM(double* a, double* x, double* y, int n, int lb)
{
    __m128d rx, ra0, ra1, ra2, ra3, ry0, ry1, ry2, ry3;
    // Standard alignas instead of the MSVC-only __declspec(align(16)).
    alignas(16) double tmp0[2], tmp1[2], tmp2[2], tmp3[2];
    double* ptr_a = a;
    for (int i = 0; i < n; i += 4) {
        ry0 = ry1 = ry2 = ry3 = _mm_setzero_pd();
        double* ptr_x = x;
        for (int j = 0; j < n; j += 8, ptr_a += 8, ptr_x += 8) {
            // 8 columns per iteration, two at a time, across 4 rows.
            // This loop replaces the original's 4x copy-pasted body; the
            // operation order (and thus FP rounding) is identical.
            for (int k = 0; k < 8; k += 2) {
                rx  = _mm_load_pd(ptr_x + k);
                ra0 = _mm_load_pd(ptr_a + k);
                ra1 = _mm_load_pd(ptr_a + n + k);
                ra2 = _mm_load_pd(ptr_a + 2 * n + k);
                ra3 = _mm_load_pd(ptr_a + 3 * n + k);
                ry0 = _mm_add_pd(ry0, _mm_mul_pd(ra0, rx));
                ry1 = _mm_add_pd(ry1, _mm_mul_pd(ra1, rx));
                ry2 = _mm_add_pd(ry2, _mm_mul_pd(ra2, rx));
                ry3 = _mm_add_pd(ry3, _mm_mul_pd(ra3, rx));
            }
        }
        // The inner loop advanced ptr_a through one full row; skip the
        // other three rows of this block.
        ptr_a += 3 * n;
        // Horizontal sums for the four accumulated rows.
        _mm_store_pd(tmp0, ry0);
        _mm_store_pd(tmp1, ry1);
        _mm_store_pd(tmp2, ry2);
        _mm_store_pd(tmp3, ry3);
        y[i]     = tmp0[0] + tmp0[1];
        y[i + 1] = tmp1[0] + tmp1[1];
        y[i + 2] = tmp2[0] + tmp2[1];
        y[i + 3] = tmp3[0] + tmp3[1];
    }
}
- #endif
- #ifdef YES_FMA
- void matvec_FMA(double* a, double* x, double* y, int n, int lb)
- {
- memset((void*)y, 0, n * sizeof(double));
- double* ptr_a = a;
- double* ptr_x = NULL;
- const int vectorSize = 4;
- const int outerStep = 4;
- const int innerStep = 16;
- __m256d ra0, ra1, ra2, ra3;
- __m256d rx0, rx1, rx2, rx3;
- __m256d ry0, ry1, ry2, ry3;
- __declspec(align(32)) double buf0[vectorSize];
- __declspec(align(32)) double buf1[vectorSize];
- __declspec(align(32)) double buf2[vectorSize];
- __declspec(align(32)) double buf3[vectorSize];
- for (int i = 0; i < n; i += outerStep, ptr_a += (outerStep - 1) * n) {
- ry0 = ry1 = ry2 = ry3 = _mm256_setzero_pd();
- ptr_x = x;
- for (int j = 0; j < n; j += innerStep, ptr_a += innerStep, ptr_x += innerStep) {
- _mm_prefetch((char*)(ptr_x + innerStep), _MM_HINT_T0);
- _mm_prefetch((char*)(ptr_x + innerStep + 8), _MM_HINT_T0);
- _mm_prefetch((char*)(ptr_a + innerStep), _MM_HINT_NTA);
- _mm_prefetch((char*)(ptr_a + innerStep + 8), _MM_HINT_NTA);
- _mm_prefetch((char*)(ptr_a + n + innerStep), _MM_HINT_NTA);
- _mm_prefetch((char*)(ptr_a + n + innerStep + 8), _MM_HINT_NTA);
- _mm_prefetch((char*)(ptr_a + 2 * n + innerStep), _MM_HINT_NTA);
- _mm_prefetch((char*)(ptr_a + 2 * n + innerStep + 8), _MM_HINT_NTA);
- _mm_prefetch((char*)(ptr_a + 3 * n + innerStep), _MM_HINT_NTA);
- _mm_prefetch((char*)(ptr_a + 3 * n + innerStep + 8), _MM_HINT_NTA);
- rx0 = _mm256_load_pd(ptr_x);
- rx1 = _mm256_load_pd(ptr_x + vectorSize);
- rx2 = _mm256_load_pd(ptr_x + 2 * vectorSize);
- rx3 = _mm256_load_pd(ptr_x + 3 * vectorSize);
- ra0 = _mm256_load_pd(ptr_a);
- ra1 = _mm256_load_pd(ptr_a + n);
- ra2 = _mm256_load_pd(ptr_a + 2 * n);
- ra3 = _mm256_load_pd(ptr_a + 3 * n);
- ry0 = _mm256_fmadd_pd(ra0, rx0, ry0);
- ry1 = _mm256_fmadd_pd(ra1, rx0, ry1);
- ry2 = _mm256_fmadd_pd(ra2, rx0, ry2);
- ry3 = _mm256_fmadd_pd(ra3, rx0, ry3);
- ra0 = _mm256_load_pd(ptr_a + vectorSize);
- ra1 = _mm256_load_pd(ptr_a + n + vectorSize);
- ra2 = _mm256_load_pd(ptr_a + 2 * n + vectorSize);
- ra3 = _mm256_load_pd(ptr_a + 3 * n + vectorSize);
- ry0 = _mm256_fmadd_pd(ra0, rx1, ry0);
- ry1 = _mm256_fmadd_pd(ra1, rx1, ry1);
- ry2 = _mm256_fmadd_pd(ra2, rx1, ry2);
- ry3 = _mm256_fmadd_pd(ra3, rx1, ry3);
- ra0 = _mm256_load_pd(ptr_a + 2 * vectorSize);
- ra1 = _mm256_load_pd(ptr_a + n + 2 * vectorSize);
- ra2 = _mm256_load_pd(ptr_a + 2 * n + 2 * vectorSize);
- ra3 = _mm256_load_pd(ptr_a + 3 * n + 2 * vectorSize);
- ry0 = _mm256_fmadd_pd(ra0, rx2, ry0);
- ry1 = _mm256_fmadd_pd(ra1, rx2, ry1);
- ry2 = _mm256_fmadd_pd(ra2, rx2, ry2);
- ry3 = _mm256_fmadd_pd(ra3, rx2, ry3);
- ra0 = _mm256_load_pd(ptr_a + 3 * vectorSize);
- ra1 = _mm256_load_pd(ptr_a + n + 3 * vectorSize);
- ra2 = _mm256_load_pd(ptr_a + 2 * n + 3 * vectorSize);
- ra3 = _mm256_load_pd(ptr_a + 3 * n + 3 * vectorSize);
- ry0 = _mm256_fmadd_pd(ra0, rx3, ry0);
- ry1 = _mm256_fmadd_pd(ra1, rx3, ry1);
- ry2 = _mm256_fmadd_pd(ra2, rx3, ry2);
- ry3 = _mm256_fmadd_pd(ra3, rx3, ry3);
- }
- _mm256_store_pd(buf0, ry0);
- _mm256_store_pd(buf1, ry1);
- _mm256_store_pd(buf2, ry2);
- _mm256_store_pd(buf3, ry3);
- y[i] = buf0[0] + buf0[1] + buf0[2] + buf0[3];
- y[i + 1] = buf1[0] + buf1[1] + buf1[2] + buf1[3];
- y[i + 2] = buf2[0] + buf2[1] + buf2[2] + buf2[3];
- y[i + 3] = buf3[0] + buf3[1] + buf3[2] + buf3[3];
- }
- ptr_a = ptr_x = NULL;
- }
- #endif
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement