////////////////////////////////////////////////////////////////////////////////////////////////////
//   mvec.cpp

#include "stdafx.h"
#include <iostream>
#include <cstring>      // memset
#include "mvec.h"
#include <emmintrin.h>  // SSE2 intrinsics
#include <immintrin.h>  // AVX / FMA intrinsics

using namespace std;

// Scalar reference: a is an n-by-n row-major matrix (element (i, j) lives at
// a[i*n + j]), so each y[i] is the dot product of row i with x.
void mult_naive(double *a, double *x, double *y, int n)
{
    int i, j, ij;
    double reg;  // accumulator for the current row

    for(i=0, ij=0; i<n; ++i)
    {
        reg = 0;

        for(j=0; j<n; ++j, ++ij)
        {
            reg += a[ij]*x[j];
        }

        y[i] = reg;
    }
}

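// Note (not in the original paste): with optimization enabled (e.g. MSVC
// /O2 /arch:AVX2 or GCC/Clang -O3 -mavx2) the compiler may auto-vectorize
// this scalar loop, so it is the baseline the hand-written kernels below
// are measured against.
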
// SSE2 kernel: processes two doubles per __m128d. Preconditions: n is a
// multiple of 2 and a, x are 16-byte aligned (required by _mm_load_pd).
// The lb parameter is unused.
void matvec_XMM(double* a, double* x, double* y, int n, int lb)
{
    int i, j;
    __m128d rx0, ra0, ry0;
    double* ptr_x, * ptr_a;
    __declspec(align(16)) double tmp[2];  // aligned scratch for the final store
    memset((void*)y, 0, n * sizeof(double));
    ptr_a = a;

    for (i = 0; i < n; i++) {
        ry0 = _mm_setzero_pd();  // two partial sums for row i
        ptr_x = x;
        for (j = 0; j < n; j += 2, ptr_a += 2, ptr_x += 2) {
            rx0 = _mm_load_pd(ptr_x);
            ra0 = _mm_load_pd(ptr_a);
            ra0 = _mm_mul_pd(ra0, rx0);
            ry0 = _mm_add_pd(ry0, ra0);
        }
        _mm_store_pd(tmp, ry0);
        y[i] = tmp[0] + tmp[1];  // horizontal add of the two lanes
    }
}

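// Note (not in the original paste): on SSE3-capable targets the final
// store-plus-scalar-add could stay in registers, e.g.
//     ry0 = _mm_hadd_pd(ry0, ry0);   // <pmmintrin.h>
//     y[i] = _mm_cvtsd_f64(ry0);
// The buffer form above keeps the kernel SSE2-only.
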
#ifdef YES_AVX
// Despite the name, this kernel still uses 128-bit XMM registers: it blocks
// four matrix rows per outer iteration and unrolls the column loop by 8.
// Preconditions: n is a multiple of 8 and a, x are 16-byte aligned.
// The lb parameter is unused.
void matvec_YMM(double* a, double* x, double* y, int n, int lb)
{
    int i, j;
    __m128d rx0, ra0, ra1, ra2, ra3, ry0, ry1, ry2, ry3;
    double* ptr_x, * ptr_a;
    __declspec(align(16)) double tmp0[2], tmp1[2], tmp2[2], tmp3[2];
    memset((void*)y, 0, n * sizeof(double));
    ptr_a = a;

    for (i = 0; i < n; i += 4) {
        ry0 = ry1 = ry2 = ry3 = _mm_setzero_pd();
        ptr_x = x;
        for (j = 0; j < n; j += 8, ptr_a += 8, ptr_x += 8) {
            // columns j..j+1 of rows i..i+3
            rx0 = _mm_load_pd(ptr_x);
            ra0 = _mm_load_pd(ptr_a);
            ra1 = _mm_load_pd(ptr_a + n);
            ra2 = _mm_load_pd(ptr_a + 2 * n);
            ra3 = _mm_load_pd(ptr_a + 3 * n);
            ra0 = _mm_mul_pd(ra0, rx0);
            ry0 = _mm_add_pd(ry0, ra0);
            ra1 = _mm_mul_pd(ra1, rx0);
            ry1 = _mm_add_pd(ry1, ra1);
            ra2 = _mm_mul_pd(ra2, rx0);
            ry2 = _mm_add_pd(ry2, ra2);
            ra3 = _mm_mul_pd(ra3, rx0);
            ry3 = _mm_add_pd(ry3, ra3);
            // columns j+2..j+3
            rx0 = _mm_load_pd(ptr_x + 2);
            ra0 = _mm_load_pd(ptr_a + 2);
            ra1 = _mm_load_pd(ptr_a + n + 2);
            ra2 = _mm_load_pd(ptr_a + 2 * n + 2);
            ra3 = _mm_load_pd(ptr_a + 3 * n + 2);
            ra0 = _mm_mul_pd(ra0, rx0);
            ry0 = _mm_add_pd(ry0, ra0);
            ra1 = _mm_mul_pd(ra1, rx0);
            ry1 = _mm_add_pd(ry1, ra1);
            ra2 = _mm_mul_pd(ra2, rx0);
            ry2 = _mm_add_pd(ry2, ra2);
            ra3 = _mm_mul_pd(ra3, rx0);
            ry3 = _mm_add_pd(ry3, ra3);
            // columns j+4..j+5
            rx0 = _mm_load_pd(ptr_x + 4);
            ra0 = _mm_load_pd(ptr_a + 4);
            ra1 = _mm_load_pd(ptr_a + n + 4);
            ra2 = _mm_load_pd(ptr_a + 2 * n + 4);
            ra3 = _mm_load_pd(ptr_a + 3 * n + 4);
            ra0 = _mm_mul_pd(ra0, rx0);
            ry0 = _mm_add_pd(ry0, ra0);
            ra1 = _mm_mul_pd(ra1, rx0);
            ry1 = _mm_add_pd(ry1, ra1);
            ra2 = _mm_mul_pd(ra2, rx0);
            ry2 = _mm_add_pd(ry2, ra2);
            ra3 = _mm_mul_pd(ra3, rx0);
            ry3 = _mm_add_pd(ry3, ra3);
            // columns j+6..j+7
            rx0 = _mm_load_pd(ptr_x + 6);
            ra0 = _mm_load_pd(ptr_a + 6);
            ra1 = _mm_load_pd(ptr_a + n + 6);
            ra2 = _mm_load_pd(ptr_a + 2 * n + 6);
            ra3 = _mm_load_pd(ptr_a + 3 * n + 6);
            ra0 = _mm_mul_pd(ra0, rx0);
            ry0 = _mm_add_pd(ry0, ra0);
            ra1 = _mm_mul_pd(ra1, rx0);
            ry1 = _mm_add_pd(ry1, ra1);
            ra2 = _mm_mul_pd(ra2, rx0);
            ry2 = _mm_add_pd(ry2, ra2);
            ra3 = _mm_mul_pd(ra3, rx0);
            ry3 = _mm_add_pd(ry3, ra3);
        }

        ptr_a += 3 * n;  // inner loop advanced ptr_a by one row; skip the other three
        _mm_store_pd(tmp0, ry0);
        _mm_store_pd(tmp1, ry1);
        _mm_store_pd(tmp2, ry2);
        _mm_store_pd(tmp3, ry3);
        y[i] = tmp0[0] + tmp0[1];
        y[i + 1] = tmp1[0] + tmp1[1];
        y[i + 2] = tmp2[0] + tmp2[1];
        y[i + 3] = tmp3[0] + tmp3[1];
    }
}
#endif

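// A sketch (not from the original paste) of what a genuinely 256-bit inner
// step would look like on AVX without FMA, using __m256d registers:
//     __m256d rx = _mm256_load_pd(ptr_x);  // 4 doubles at a time
//     ry0 = _mm256_add_pd(ry0, _mm256_mul_pd(_mm256_load_pd(ptr_a), rx));
// This halves the instruction count per element but requires 32-byte
// alignment, as in matvec_FMA below.
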
#ifdef YES_FMA
// AVX2/FMA kernel: four doubles per __m256d, four rows per outer iteration,
// column loop unrolled by 16, with software prefetch of the next iteration's
// operands. Preconditions: n is a multiple of 16 and a, x are 32-byte
// aligned (required by _mm256_load_pd). The lb parameter is unused.
void matvec_FMA(double* a, double* x, double* y, int n, int lb)
{
    memset((void*)y, 0, n * sizeof(double));
    double* ptr_a = a;
    double* ptr_x = NULL;
    const int vectorSize = 4;   // doubles per YMM register
    const int outerStep = 4;    // rows per outer iteration
    const int innerStep = 16;   // columns per inner iteration
    __m256d ra0, ra1, ra2, ra3;
    __m256d rx0, rx1, rx2, rx3;
    __m256d ry0, ry1, ry2, ry3;
    __declspec(align(32)) double buf0[vectorSize];
    __declspec(align(32)) double buf1[vectorSize];
    __declspec(align(32)) double buf2[vectorSize];
    __declspec(align(32)) double buf3[vectorSize];

    // The inner loop advances ptr_a by one row (n doubles); the outer update
    // skips the remaining outerStep - 1 rows of the finished block.
    for (int i = 0; i < n; i += outerStep, ptr_a += (outerStep - 1) * n) {
        ry0 = ry1 = ry2 = ry3 = _mm256_setzero_pd();
        ptr_x = x;
        for (int j = 0; j < n; j += innerStep, ptr_a += innerStep, ptr_x += innerStep) {
            // Prefetch the next block: x is reused across rows (temporal hint),
            // matrix rows are touched only once (non-temporal hint).
            _mm_prefetch((char*)(ptr_x + innerStep), _MM_HINT_T0);
            _mm_prefetch((char*)(ptr_x + innerStep + 8), _MM_HINT_T0);
            _mm_prefetch((char*)(ptr_a + innerStep), _MM_HINT_NTA);
            _mm_prefetch((char*)(ptr_a + innerStep + 8), _MM_HINT_NTA);
            _mm_prefetch((char*)(ptr_a + n + innerStep), _MM_HINT_NTA);
            _mm_prefetch((char*)(ptr_a + n + innerStep + 8), _MM_HINT_NTA);
            _mm_prefetch((char*)(ptr_a + 2 * n + innerStep), _MM_HINT_NTA);
            _mm_prefetch((char*)(ptr_a + 2 * n + innerStep + 8), _MM_HINT_NTA);
            _mm_prefetch((char*)(ptr_a + 3 * n + innerStep), _MM_HINT_NTA);
            _mm_prefetch((char*)(ptr_a + 3 * n + innerStep + 8), _MM_HINT_NTA);
            rx0 = _mm256_load_pd(ptr_x);
            rx1 = _mm256_load_pd(ptr_x + vectorSize);
            rx2 = _mm256_load_pd(ptr_x + 2 * vectorSize);
            rx3 = _mm256_load_pd(ptr_x + 3 * vectorSize);
            // columns j..j+3 of rows i..i+3
            ra0 = _mm256_load_pd(ptr_a);
            ra1 = _mm256_load_pd(ptr_a + n);
            ra2 = _mm256_load_pd(ptr_a + 2 * n);
            ra3 = _mm256_load_pd(ptr_a + 3 * n);
            ry0 = _mm256_fmadd_pd(ra0, rx0, ry0);
            ry1 = _mm256_fmadd_pd(ra1, rx0, ry1);
            ry2 = _mm256_fmadd_pd(ra2, rx0, ry2);
            ry3 = _mm256_fmadd_pd(ra3, rx0, ry3);
            // columns j+4..j+7
            ra0 = _mm256_load_pd(ptr_a + vectorSize);
            ra1 = _mm256_load_pd(ptr_a + n + vectorSize);
            ra2 = _mm256_load_pd(ptr_a + 2 * n + vectorSize);
            ra3 = _mm256_load_pd(ptr_a + 3 * n + vectorSize);
            ry0 = _mm256_fmadd_pd(ra0, rx1, ry0);
            ry1 = _mm256_fmadd_pd(ra1, rx1, ry1);
            ry2 = _mm256_fmadd_pd(ra2, rx1, ry2);
            ry3 = _mm256_fmadd_pd(ra3, rx1, ry3);
            // columns j+8..j+11
            ra0 = _mm256_load_pd(ptr_a + 2 * vectorSize);
            ra1 = _mm256_load_pd(ptr_a + n + 2 * vectorSize);
            ra2 = _mm256_load_pd(ptr_a + 2 * n + 2 * vectorSize);
            ra3 = _mm256_load_pd(ptr_a + 3 * n + 2 * vectorSize);
            ry0 = _mm256_fmadd_pd(ra0, rx2, ry0);
            ry1 = _mm256_fmadd_pd(ra1, rx2, ry1);
            ry2 = _mm256_fmadd_pd(ra2, rx2, ry2);
            ry3 = _mm256_fmadd_pd(ra3, rx2, ry3);
            // columns j+12..j+15
            ra0 = _mm256_load_pd(ptr_a + 3 * vectorSize);
            ra1 = _mm256_load_pd(ptr_a + n + 3 * vectorSize);
            ra2 = _mm256_load_pd(ptr_a + 2 * n + 3 * vectorSize);
            ra3 = _mm256_load_pd(ptr_a + 3 * n + 3 * vectorSize);
            ry0 = _mm256_fmadd_pd(ra0, rx3, ry0);
            ry1 = _mm256_fmadd_pd(ra1, rx3, ry1);
            ry2 = _mm256_fmadd_pd(ra2, rx3, ry2);
            ry3 = _mm256_fmadd_pd(ra3, rx3, ry3);
        }
        // Horizontal reduction: spill each accumulator and sum its four lanes.
        _mm256_store_pd(buf0, ry0);
        _mm256_store_pd(buf1, ry1);
        _mm256_store_pd(buf2, ry2);
        _mm256_store_pd(buf3, ry3);
        y[i] = buf0[0] + buf0[1] + buf0[2] + buf0[3];
        y[i + 1] = buf1[0] + buf1[1] + buf1[2] + buf1[3];
        y[i + 2] = buf2[0] + buf2[1] + buf2[2] + buf2[3];
        y[i + 3] = buf3[0] + buf3[1] + buf3[2] + buf3[3];
    }
}
#endif
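
////////////////////////////////////////////////////////////////////////////////////////////////////
//   Usage sketch (not part of the original paste): a minimal driver, e.g. in a
//   separate main.cpp with "mvec.h" and <immintrin.h> included. It assumes n
//   is a multiple of 16 and obtains the 32-byte alignment the kernels require
//   via _mm_malloc/_mm_free; the unused lb parameter is passed as 0.

#include <cstdio>
#include <cstdlib>
#include <cmath>

int main()
{
    const int n = 1024;  // multiple of 16
    double* a  = (double*)_mm_malloc((size_t)n * n * sizeof(double), 32);
    double* x  = (double*)_mm_malloc(n * sizeof(double), 32);
    double* y0 = (double*)_mm_malloc(n * sizeof(double), 32);
    double* y1 = (double*)_mm_malloc(n * sizeof(double), 32);

    // Fill the matrix and vector with pseudo-random values in [0, 1].
    for (int i = 0; i < n * n; ++i) a[i] = (double)rand() / RAND_MAX;
    for (int i = 0; i < n; ++i)     x[i] = (double)rand() / RAND_MAX;

    mult_naive(a, x, y0, n);     // scalar reference
    matvec_XMM(a, x, y1, n, 0);  // SSE2 kernel

    // Compare the two results element by element.
    double maxerr = 0;
    for (int i = 0; i < n; ++i) {
        double d = fabs(y0[i] - y1[i]);
        if (d > maxerr) maxerr = d;
    }
    printf("max abs error: %g\n", maxerr);

    _mm_free(a); _mm_free(x); _mm_free(y0); _mm_free(y1);
    return 0;
}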