Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <sys/time.h>
- #include <time.h>
- #include <papi.h>
- #include <time.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <mmintrin.h>
- #include <xmmintrin.h> // SSE
- #include <pmmintrin.h> // SSE2
- #include <emmintrin.h> // SSE3
- #define T double
- T* make_matrix(int n)
- {
- T* mat = (T*)malloc(sizeof(T)*n*n);
- for (int i = 0; i < n*n; ++i)
- {
- mat[i] = rand() / RAND_MAX;
- }
- return mat;
- }
- void destroy_matrix(T* m)
- {
- free(m);
- }
- void mm0(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = 0; i < n; ++i)
- {
- for (int j = 0; j < n; ++j)
- {
- T sum = 0;
- for (int k = 0; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void mm1(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- for (int k = n; --k;)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void mm2(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- int k = 0;
- for (; k < n - 7; k += 8)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- sum += lhs[i*n+k+1] * rhs[(k+1)*n+j];
- sum += lhs[i*n+k+2] * rhs[(k+2)*n+j];
- sum += lhs[i*n+k+3] * rhs[(k+3)*n+j];
- sum += lhs[i*n+k+4] * rhs[(k+4)*n+j];
- sum += lhs[i*n+k+5] * rhs[(k+5)*n+j];
- sum += lhs[i*n+k+6] * rhs[(k+6)*n+j];
- sum += lhs[i*n+k+7] * rhs[(k+7)*n+j];
- }
- for (; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void mm3(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- int k = 0;
- for (; k < n - 15; k += 16)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- sum += lhs[i*n+k+1] * rhs[(k+1)*n+j];
- sum += lhs[i*n+k+2] * rhs[(k+2)*n+j];
- sum += lhs[i*n+k+3] * rhs[(k+3)*n+j];
- sum += lhs[i*n+k+4] * rhs[(k+4)*n+j];
- sum += lhs[i*n+k+5] * rhs[(k+5)*n+j];
- sum += lhs[i*n+k+6] * rhs[(k+6)*n+j];
- sum += lhs[i*n+k+7] * rhs[(k+7)*n+j];
- sum += lhs[i*n+k+8] * rhs[(k+8)*n+j];
- sum += lhs[i*n+k+9] * rhs[(k+9)*n+j];
- sum += lhs[i*n+k+10] * rhs[(k+10)*n+j];
- sum += lhs[i*n+k+11] * rhs[(k+11)*n+j];
- sum += lhs[i*n+k+12] * rhs[(k+12)*n+j];
- sum += lhs[i*n+k+13] * rhs[(k+13)*n+j];
- sum += lhs[i*n+k+14] * rhs[(k+14)*n+j];
- sum += lhs[i*n+k+15] * rhs[(k+15)*n+j];
- }
- for (; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void mm4(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- int k = 0;
- for (; k < n - 31; k += 32)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- sum += lhs[i*n+k+1] * rhs[(k+1)*n+j];
- sum += lhs[i*n+k+2] * rhs[(k+2)*n+j];
- sum += lhs[i*n+k+3] * rhs[(k+3)*n+j];
- sum += lhs[i*n+k+4] * rhs[(k+4)*n+j];
- sum += lhs[i*n+k+5] * rhs[(k+5)*n+j];
- sum += lhs[i*n+k+6] * rhs[(k+6)*n+j];
- sum += lhs[i*n+k+7] * rhs[(k+7)*n+j];
- sum += lhs[i*n+k+8] * rhs[(k+8)*n+j];
- sum += lhs[i*n+k+9] * rhs[(k+9)*n+j];
- sum += lhs[i*n+k+10] * rhs[(k+10)*n+j];
- sum += lhs[i*n+k+11] * rhs[(k+11)*n+j];
- sum += lhs[i*n+k+12] * rhs[(k+12)*n+j];
- sum += lhs[i*n+k+13] * rhs[(k+13)*n+j];
- sum += lhs[i*n+k+14] * rhs[(k+14)*n+j];
- sum += lhs[i*n+k+15] * rhs[(k+15)*n+j];
- sum += lhs[i*n+k+16] * rhs[(k+16)*n+j];
- sum += lhs[i*n+k+17] * rhs[(k+17)*n+j];
- sum += lhs[i*n+k+18] * rhs[(k+18)*n+j];
- sum += lhs[i*n+k+19] * rhs[(k+19)*n+j];
- sum += lhs[i*n+k+20] * rhs[(k+20)*n+j];
- sum += lhs[i*n+k+21] * rhs[(k+21)*n+j];
- sum += lhs[i*n+k+22] * rhs[(k+22)*n+j];
- sum += lhs[i*n+k+23] * rhs[(k+23)*n+j];
- sum += lhs[i*n+k+24] * rhs[(k+24)*n+j];
- sum += lhs[i*n+k+25] * rhs[(k+25)*n+j];
- sum += lhs[i*n+k+26] * rhs[(k+26)*n+j];
- sum += lhs[i*n+k+27] * rhs[(k+27)*n+j];
- sum += lhs[i*n+k+28] * rhs[(k+28)*n+j];
- sum += lhs[i*n+k+29] * rhs[(k+29)*n+j];
- sum += lhs[i*n+k+30] * rhs[(k+30)*n+j];
- sum += lhs[i*n+k+31] * rhs[(k+31)*n+j];
- }
- for (; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void transpose(T* from, T* to, int n)
- {
- for (int i = 0; i < n; ++i)
- {
- for (int j = 0; j < n; ++j)
- {
- to[i*n+j] = from[i+j*n];
- }
- }
- }
- void mm5(T* lhs, T* rhs, T* result, int n)
- {
- T* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- int k = 0;
- for (; k < n - 31; k += 32)
- {
- sum += lhs[i*n+k] * rhs_t[k+j*n];
- sum += lhs[i*n+k+1] * rhs_t[(k+1)+j*n];
- sum += lhs[i*n+k+2] * rhs_t[(k+2)+j*n];
- sum += lhs[i*n+k+3] * rhs_t[(k+3)+j*n];
- sum += lhs[i*n+k+4] * rhs_t[(k+4)+j*n];
- sum += lhs[i*n+k+5] * rhs_t[(k+5)+j*n];
- sum += lhs[i*n+k+6] * rhs_t[(k+6)+j*n];
- sum += lhs[i*n+k+7] * rhs_t[(k+7)+j*n];
- sum += lhs[i*n+k+8] * rhs_t[(k+8)+j*n];
- sum += lhs[i*n+k+9] * rhs_t[(k+9)+j*n];
- sum += lhs[i*n+k+10] * rhs_t[(k+10)+j*n];
- sum += lhs[i*n+k+11] * rhs_t[(k+11)+j*n];
- sum += lhs[i*n+k+12] * rhs_t[(k+12)+j*n];
- sum += lhs[i*n+k+13] * rhs_t[(k+13)+j*n];
- sum += lhs[i*n+k+14] * rhs_t[(k+14)+j*n];
- sum += lhs[i*n+k+15] * rhs_t[(k+15)+j*n];
- sum += lhs[i*n+k+16] * rhs_t[(k+16)+j*n];
- sum += lhs[i*n+k+17] * rhs_t[(k+17)+j*n];
- sum += lhs[i*n+k+18] * rhs_t[(k+18)+j*n];
- sum += lhs[i*n+k+19] * rhs_t[(k+19)+j*n];
- sum += lhs[i*n+k+20] * rhs_t[(k+20)+j*n];
- sum += lhs[i*n+k+21] * rhs_t[(k+21)+j*n];
- sum += lhs[i*n+k+22] * rhs_t[(k+22)+j*n];
- sum += lhs[i*n+k+23] * rhs_t[(k+23)+j*n];
- sum += lhs[i*n+k+24] * rhs_t[(k+24)+j*n];
- sum += lhs[i*n+k+25] * rhs_t[(k+25)+j*n];
- sum += lhs[i*n+k+26] * rhs_t[(k+26)+j*n];
- sum += lhs[i*n+k+27] * rhs_t[(k+27)+j*n];
- sum += lhs[i*n+k+28] * rhs_t[(k+28)+j*n];
- sum += lhs[i*n+k+29] * rhs_t[(k+29)+j*n];
- sum += lhs[i*n+k+30] * rhs_t[(k+30)+j*n];
- sum += lhs[i*n+k+31] * rhs_t[(k+31)+j*n];
- }
- for (; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs_t[k+j*n];
- }
- result[i*n+j] = sum;
- }
- }
- destroy_matrix(rhs_t);
- }
- void mm6(T* lhs, T* rhs, T* result, int n)
- {
- T* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum0 = 0;
- T sum1 = 0;
- T sum2 = 0;
- T sum3 = 0;
- int k = 0;
- for (; k < n - 7; k += 8)
- {
- sum0 += lhs[i*n+k] * rhs_t[k+j*n];
- sum1 += lhs[i*n+k+1] * rhs_t[(k+1)+j*n];
- sum2 += lhs[i*n+k+2] * rhs_t[(k+2)+j*n];
- sum3 += lhs[i*n+k+3] * rhs_t[(k+3)+j*n];
- sum0 += lhs[i*n+k+4] * rhs_t[(k+4)+j*n];
- sum1 += lhs[i*n+k+5] * rhs_t[(k+5)+j*n];
- sum2 += lhs[i*n+k+6] * rhs_t[(k+6)+j*n];
- sum3 += lhs[i*n+k+7] * rhs_t[(k+7)+j*n];
- }
- for (; k < n; ++k)
- {
- sum0 += lhs[i*n+k] * rhs_t[k+j*n];
- }
- result[i*n+j] = sum0 + sum1 + sum2 + sum3;
- }
- }
- destroy_matrix(rhs_t);
- }
- void mm7(T* lhs, T* rhs, T* result, int n)
- {
- T* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = 0; i < n; ++i)
- {
- int j = 0;
- for (; j < n - 7; j += 4)
- {
- T sum0 = 0;
- T sum1 = 0;
- T sum2 = 0;
- T sum3 = 0;
- int k = 0;
- for (; k < n - 7; k += 8)
- {
- const T a0 = lhs[i*n+k+0];
- const T a1 = lhs[i*n+k+1];
- const T a2 = lhs[i*n+k+2];
- const T a3 = lhs[i*n+k+3];
- const T a4 = lhs[i*n+k+4];
- const T a5 = lhs[i*n+k+5];
- const T a6 = lhs[i*n+k+6];
- const T a7 = lhs[i*n+k+7];
- sum0 += a0 * rhs_t[(k+0)+j*n];
- sum1 += a0 * rhs_t[(k+0)+(j+1)*n];
- sum2 += a0 * rhs_t[(k+0)+(j+2)*n];
- sum3 += a0 * rhs_t[(k+0)+(j+3)*n];
- sum0 += a1 * rhs_t[(k+1)+j*n];
- sum1 += a1 * rhs_t[(k+1)+(j+1)*n];
- sum2 += a1 * rhs_t[(k+1)+(j+2)*n];
- sum3 += a1 * rhs_t[(k+1)+(j+3)*n];
- sum0 += a2 * rhs_t[(k+2)+j*n];
- sum1 += a2 * rhs_t[(k+2)+(j+1)*n];
- sum2 += a2 * rhs_t[(k+2)+(j+2)*n];
- sum3 += a2 * rhs_t[(k+2)+(j+3)*n];
- sum0 += a3 * rhs_t[(k+3)+j*n];
- sum1 += a3 * rhs_t[(k+3)+(j+1)*n];
- sum2 += a3 * rhs_t[(k+3)+(j+2)*n];
- sum3 += a3 * rhs_t[(k+3)+(j+3)*n];
- sum0 += a4 * rhs_t[(k+4)+j*n];
- sum1 += a4 * rhs_t[(k+4)+(j+1)*n];
- sum2 += a4 * rhs_t[(k+4)+(j+2)*n];
- sum3 += a4 * rhs_t[(k+4)+(j+3)*n];
- sum0 += a5 * rhs_t[(k+5)+j*n];
- sum1 += a5 * rhs_t[(k+5)+(j+1)*n];
- sum2 += a5 * rhs_t[(k+5)+(j+2)*n];
- sum3 += a5 * rhs_t[(k+5)+(j+3)*n];
- sum0 += a6 * rhs_t[(k+6)+j*n];
- sum1 += a6 * rhs_t[(k+6)+(j+1)*n];
- sum2 += a6 * rhs_t[(k+6)+(j+2)*n];
- sum3 += a6 * rhs_t[(k+6)+(j+3)*n];
- sum0 += a7 * rhs_t[(k+7)+j*n];
- sum1 += a7 * rhs_t[(k+7)+(j+1)*n];
- sum2 += a7 * rhs_t[(k+7)+(j+2)*n];
- sum3 += a7 * rhs_t[(k+7)+(j+3)*n];
- }
- result[i*n+j] = sum0;
- result[i*n+j+1] = sum1;
- result[i*n+j+2] = sum2;
- result[i*n+j+3] = sum3;
- }
- }
- destroy_matrix(rhs_t);
- }
- void mm8_d(double* lhs, double* rhs, double* result, int n)
- {
- double* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = 0; i < n; ++i)
- {
- int j = 0;
- for (; j < n - 7; j += 8)
- {
- __m128d sum0 = _mm_setzero_pd();
- __m128d sum1 = _mm_setzero_pd();
- __m128d sum2 = _mm_setzero_pd();
- __m128d sum3 = _mm_setzero_pd();
- __m128d sum4 = _mm_setzero_pd();
- __m128d sum5 = _mm_setzero_pd();
- __m128d sum6 = _mm_setzero_pd();
- __m128d sum7 = _mm_setzero_pd();
- int k = 0;
- for (; k < n - 7; k += 8)
- {
- const __m128d a01 = _mm_load_pd(&lhs[i*n+k+0]);
- const __m128d a23 = _mm_load_pd(&lhs[i*n+k+2]);
- const __m128d a45 = _mm_load_pd(&lhs[i*n+k+4]);
- const __m128d a67 = _mm_load_pd(&lhs[i*n+k+6]);
- sum0 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+j*n]));
- sum0 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+j*n]));
- sum0 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+j*n]));
- sum0 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+j*n]));
- sum1 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+1)*n]));
- sum1 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+1)*n]));
- sum1 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+1)*n]));
- sum1 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+1)*n]));
- sum2 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+2)*n]));
- sum2 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+2)*n]));
- sum2 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+2)*n]));
- sum2 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+2)*n]));
- sum3 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+3)*n]));
- sum3 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+3)*n]));
- sum3 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+3)*n]));
- sum3 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+3)*n]));
- sum4 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+4)*n]));
- sum4 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+4)*n]));
- sum4 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+4)*n]));
- sum4 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+4)*n]));
- sum5 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+5)*n]));
- sum5 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+5)*n]));
- sum5 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+5)*n]));
- sum1 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+5)*n]));
- sum6 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+6)*n]));
- sum6 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+6)*n]));
- sum6 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+6)*n]));
- sum6 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+6)*n]));
- sum7 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+7)*n]));
- sum7 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+7)*n]));
- sum7 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+7)*n]));
- sum7 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+7)*n]));
- }
- _mm_store_pd(&result[i*n+j], _mm_hadd_pd(sum0, sum1));
- _mm_store_pd(&result[i*n+j+2], _mm_hadd_pd(sum2, sum3));
- _mm_store_pd(&result[i*n+j+4], _mm_hadd_pd(sum4, sum5));
- _mm_store_pd(&result[i*n+j+6], _mm_hadd_pd(sum6, sum7));
- }
- }
- destroy_matrix(rhs_t);
- }
- void mm8_f(float* lhs, float* rhs, float* result, int n)
- {
- float* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = 0; i < n; ++i)
- {
- int j = 0;
- for (; j < n - 7; j += 8)
- {
- __m128 sum0 = _mm_setzero_ps();
- __m128 sum1 = _mm_setzero_ps();
- __m128 sum2 = _mm_setzero_ps();
- __m128 sum3 = _mm_setzero_ps();
- __m128 sum4 = _mm_setzero_ps();
- __m128 sum5 = _mm_setzero_ps();
- __m128 sum6 = _mm_setzero_ps();
- __m128 sum7 = _mm_setzero_ps();
- int k = 0;
- for (; k < n - 15; k += 16)
- {
- const __m128 a0123 = _mm_load_ps(&lhs[i*n+k+0]);
- const __m128 a4567 = _mm_load_ps(&lhs[i*n+k+4]);
- const __m128 b0123 = _mm_load_ps(&lhs[i*n+k+8]);
- const __m128 b4567 = _mm_load_ps(&lhs[i*n+k+12]);
- sum0 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+j*n]));
- sum0 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+j*n]));
- sum0 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+j*n]));
- sum0 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+j*n]));
- sum1 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+1)*n]));
- sum1 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+1)*n]));
- sum1 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+1)*n]));
- sum1 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+1)*n]));
- sum2 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+2)*n]));
- sum2 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+2)*n]));
- sum3 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+2)*n]));
- sum3 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+2)*n]));
- sum3 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+3)*n]));
- sum3 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+3)*n]));
- sum4 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+3)*n]));
- sum4 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+3)*n]));
- sum4 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+4)*n]));
- sum4 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+4)*n]));
- sum4 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+4)*n]));
- sum4 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+4)*n]));
- sum5 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+5)*n]));
- sum5 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+5)*n]));
- sum5 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+5)*n]));
- sum5 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+5)*n]));
- sum6 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+6)*n]));
- sum6 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+6)*n]));
- sum6 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+6)*n]));
- sum6 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+6)*n]));
- sum7 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+7)*n]));
- sum7 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+7)*n]));
- sum7 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+7)*n]));
- sum7 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+7)*n]));
- }
- __m128 t0 = _mm_hadd_ps(sum0, sum1);
- __m128 t1 = _mm_hadd_ps(sum2, sum3);
- __m128 t2 = _mm_hadd_ps(sum4, sum5);
- __m128 t3 = _mm_hadd_ps(sum6, sum7);
- _mm_store_ps(&result[i*n+j], _mm_hadd_ps(t0, t1));
- _mm_store_ps(&result[i*n+j+4], _mm_hadd_ps(t2, t3));
- }
- }
- destroy_matrix(rhs_t);
- }
- #define SIZE 512
- static double gtod_ref_time_sec = 0.0;
- /* Adapted from the bl2_clock() routine in the BLIS library */
- double dclock() {
- double the_time, norm_sec;
- struct timeval tv;
- gettimeofday(&tv, NULL);
- if (gtod_ref_time_sec == 0.0)
- gtod_ref_time_sec = (double) tv.tv_sec;
- norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
- the_time = norm_sec + tv.tv_usec * 1.0e-6;
- return the_time;
- }
- /* Mulitplication of two matrices */
- int mm(double first[][SIZE], double second[][SIZE], double multiply[][SIZE]) {
- int i, j, k;
- double sum = 0;
- for (i = 0; i < SIZE; i++) { //rows in multiply
- for (j = 0; j < SIZE; j++) { //columns in multiply
- for (k = 0; k < SIZE; k++) { //columns in first and rows in second
- sum = sum + first[i][k] * second[k][j];
- }
- multiply[i][j] = sum;
- sum = 0;
- }
- }
- return 0;
- }
- float time_it(void (*f)(T*, T*, T*, int), int n, float min_t)
- {
- T* lhs = make_matrix(n);
- T* rhs = make_matrix(n);
- T* result = make_matrix(n);
- clock_t start = clock();
- for(int i = 1;; ++i)
- {
- f(lhs, rhs, result, n);
- clock_t end = clock();
- float seconds = (float)(end - start) / CLOCKS_PER_SEC;
- if (seconds >= min_t)
- {
- return seconds / i;
- }
- }
- destroy_matrix(lhs);
- destroy_matrix(rhs);
- destroy_matrix(result);
- }
- int main(int argc, const char* argv[]) {
- int i, j, iret;
- double first[SIZE][SIZE];
- double second[SIZE][SIZE];
- double multiply[SIZE][SIZE];
- double dtime;
- /* PAPI variables */
- int measure;
- int * events;
- long long * values;
- int numevents;
- int retval;
- int check;
- float real_time;
- float proc_time;
- float mflops;
- long long flpins;
- char * errorstring;
- /* Initalize matrices */
- for (i = 0; i < SIZE; i++) { //rows in first
- for (j = 0; j < SIZE; j++) { //columns in first
- first[i][j] = i + j;
- second[i][j] = i - j;
- multiply[i][j] = 0.0;
- }
- }
- measure = 0;
- if (argc > 1) {
- measure = atoi(argv[1]);
- }
- /* Start PAPI counters*/
- if (measure == 1) {
- retval = PAPI_flops(&real_time, &proc_time, &flpins, &mflops);
- if (retval != PAPI_OK) {
- errorstring = PAPI_strerror(retval);
- fprintf(stderr, errorstring);
- fprintf(stderr, "\n");
- free(errorstring);
- exit(1);
- }
- printf("PAPI started\n");
- }
- if (measure == 2) {
- numevents = 2;
- events = malloc(sizeof *events * numevents);
- events[0] = PAPI_L1_ICM;
- events[1] = PAPI_FP_OPS;
- PAPI_library_init(check);
- retval = PAPI_start_counters(events, numevents);
- if (retval != PAPI_OK) {
- errorstring = PAPI_strerror(retval);
- fprintf(stderr, errorstring);
- fprintf(stderr, "\n");
- free(errorstring);
- exit(1);
- }
- printf("PAPI started\n");
- }
- /* Measure execution time */
- if (measure == 0) {
- dtime = dclock();
- }
- /* Here is call to matrix multiplication functionality */
- int n = 1024;
- float min_t = 0.0f;
- time_it(mm0, n, min_t)
- //iret = mm(first, second, multiply);
- /* Measure execution time */
- if (measure == 0) {
- dtime = dclock() - dtime;
- printf("Time: %le \n", dtime);
- }
- /* Here is PAPI reading and printout */
- if (measure == 1) {
- retval = PAPI_flops(&real_time, &proc_time, &flpins, &mflops);
- if (retval != PAPI_OK) {
- errorstring = PAPI_strerror(retval);
- fprintf(stderr, errorstring);
- fprintf(stderr, "\n");
- free(errorstring);
- exit(1);
- }
- printf("Real_time: %f Proc_time: %f Total flpops: %lld MFLOPS: %f\n", real_time, proc_time, flpins, mflops);
- }
- if (measure == 2) {
- values = malloc(sizeof *values * numevents);
- retval = PAPI_stop_counters(values, numevents);
- if (retval != PAPI_OK) {
- errorstring = PAPI_strerror(retval);
- fprintf(stderr, errorstring);
- fprintf(stderr, "\n");
- free(errorstring);
- exit(1);
- }
- printf("Mem loads: %lld Mem store: %lld\n", values[0], values[1]);
- free(values);
- free(events);
- }
- /* Checking part of the code. Proper value is 2.932020e+12 */
- double mcheck = 0.0;
- for (i = 0; i < SIZE; i++) {
- for (j = 0; j < SIZE; j++) {
- mcheck += multiply[i][j];
- }
- }
- printf("check %le \n", mcheck);
- fflush(stdout);
- return iret;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement