Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <time.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <mmintrin.h>
- #include <xmmintrin.h> // SSE
- #include <pmmintrin.h> // SSE2
- #include <emmintrin.h> // SSE3
- #define T float
- T* make_matrix(int n)
- {
- T* mat = (T*)malloc(sizeof(T)*n*n);
- for (int i = 0; i < n*n; ++i)
- {
- mat[i] = rand() / RAND_MAX;
- }
- return mat;
- }
- void destroy_matrix(T* m)
- {
- free(m);
- }
- void mm0(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = 0; i < n; ++i)
- {
- for (int j = 0; j < n; ++j)
- {
- T sum = 0;
- for (int k = 0; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void mm1(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- for (int k = n; --k;)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void mm2(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- int k = 0;
- for (; k < n - 7; k += 8)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- sum += lhs[i*n+k+1] * rhs[(k+1)*n+j];
- sum += lhs[i*n+k+2] * rhs[(k+2)*n+j];
- sum += lhs[i*n+k+3] * rhs[(k+3)*n+j];
- sum += lhs[i*n+k+4] * rhs[(k+4)*n+j];
- sum += lhs[i*n+k+5] * rhs[(k+5)*n+j];
- sum += lhs[i*n+k+6] * rhs[(k+6)*n+j];
- sum += lhs[i*n+k+7] * rhs[(k+7)*n+j];
- }
- for (; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void mm3(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- int k = 0;
- for (; k < n - 15; k += 16)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- sum += lhs[i*n+k+1] * rhs[(k+1)*n+j];
- sum += lhs[i*n+k+2] * rhs[(k+2)*n+j];
- sum += lhs[i*n+k+3] * rhs[(k+3)*n+j];
- sum += lhs[i*n+k+4] * rhs[(k+4)*n+j];
- sum += lhs[i*n+k+5] * rhs[(k+5)*n+j];
- sum += lhs[i*n+k+6] * rhs[(k+6)*n+j];
- sum += lhs[i*n+k+7] * rhs[(k+7)*n+j];
- sum += lhs[i*n+k+8] * rhs[(k+8)*n+j];
- sum += lhs[i*n+k+9] * rhs[(k+9)*n+j];
- sum += lhs[i*n+k+10] * rhs[(k+10)*n+j];
- sum += lhs[i*n+k+11] * rhs[(k+11)*n+j];
- sum += lhs[i*n+k+12] * rhs[(k+12)*n+j];
- sum += lhs[i*n+k+13] * rhs[(k+13)*n+j];
- sum += lhs[i*n+k+14] * rhs[(k+14)*n+j];
- sum += lhs[i*n+k+15] * rhs[(k+15)*n+j];
- }
- for (; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void mm4(T* lhs, T* rhs, T* result, int n)
- {
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- int k = 0;
- for (; k < n - 31; k += 32)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- sum += lhs[i*n+k+1] * rhs[(k+1)*n+j];
- sum += lhs[i*n+k+2] * rhs[(k+2)*n+j];
- sum += lhs[i*n+k+3] * rhs[(k+3)*n+j];
- sum += lhs[i*n+k+4] * rhs[(k+4)*n+j];
- sum += lhs[i*n+k+5] * rhs[(k+5)*n+j];
- sum += lhs[i*n+k+6] * rhs[(k+6)*n+j];
- sum += lhs[i*n+k+7] * rhs[(k+7)*n+j];
- sum += lhs[i*n+k+8] * rhs[(k+8)*n+j];
- sum += lhs[i*n+k+9] * rhs[(k+9)*n+j];
- sum += lhs[i*n+k+10] * rhs[(k+10)*n+j];
- sum += lhs[i*n+k+11] * rhs[(k+11)*n+j];
- sum += lhs[i*n+k+12] * rhs[(k+12)*n+j];
- sum += lhs[i*n+k+13] * rhs[(k+13)*n+j];
- sum += lhs[i*n+k+14] * rhs[(k+14)*n+j];
- sum += lhs[i*n+k+15] * rhs[(k+15)*n+j];
- sum += lhs[i*n+k+16] * rhs[(k+16)*n+j];
- sum += lhs[i*n+k+17] * rhs[(k+17)*n+j];
- sum += lhs[i*n+k+18] * rhs[(k+18)*n+j];
- sum += lhs[i*n+k+19] * rhs[(k+19)*n+j];
- sum += lhs[i*n+k+20] * rhs[(k+20)*n+j];
- sum += lhs[i*n+k+21] * rhs[(k+21)*n+j];
- sum += lhs[i*n+k+22] * rhs[(k+22)*n+j];
- sum += lhs[i*n+k+23] * rhs[(k+23)*n+j];
- sum += lhs[i*n+k+24] * rhs[(k+24)*n+j];
- sum += lhs[i*n+k+25] * rhs[(k+25)*n+j];
- sum += lhs[i*n+k+26] * rhs[(k+26)*n+j];
- sum += lhs[i*n+k+27] * rhs[(k+27)*n+j];
- sum += lhs[i*n+k+28] * rhs[(k+28)*n+j];
- sum += lhs[i*n+k+29] * rhs[(k+29)*n+j];
- sum += lhs[i*n+k+30] * rhs[(k+30)*n+j];
- sum += lhs[i*n+k+31] * rhs[(k+31)*n+j];
- }
- for (; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs[k*n+j];
- }
- result[i*n+j] = sum;
- }
- }
- }
- void transpose(T* from, T* to, int n)
- {
- for (int i = 0; i < n; ++i)
- {
- for (int j = 0; j < n; ++j)
- {
- to[i*n+j] = from[i+j*n];
- }
- }
- }
- void mm5(T* lhs, T* rhs, T* result, int n)
- {
- T* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum = 0;
- int k = 0;
- for (; k < n - 31; k += 32)
- {
- sum += lhs[i*n+k] * rhs_t[k+j*n];
- sum += lhs[i*n+k+1] * rhs_t[(k+1)+j*n];
- sum += lhs[i*n+k+2] * rhs_t[(k+2)+j*n];
- sum += lhs[i*n+k+3] * rhs_t[(k+3)+j*n];
- sum += lhs[i*n+k+4] * rhs_t[(k+4)+j*n];
- sum += lhs[i*n+k+5] * rhs_t[(k+5)+j*n];
- sum += lhs[i*n+k+6] * rhs_t[(k+6)+j*n];
- sum += lhs[i*n+k+7] * rhs_t[(k+7)+j*n];
- sum += lhs[i*n+k+8] * rhs_t[(k+8)+j*n];
- sum += lhs[i*n+k+9] * rhs_t[(k+9)+j*n];
- sum += lhs[i*n+k+10] * rhs_t[(k+10)+j*n];
- sum += lhs[i*n+k+11] * rhs_t[(k+11)+j*n];
- sum += lhs[i*n+k+12] * rhs_t[(k+12)+j*n];
- sum += lhs[i*n+k+13] * rhs_t[(k+13)+j*n];
- sum += lhs[i*n+k+14] * rhs_t[(k+14)+j*n];
- sum += lhs[i*n+k+15] * rhs_t[(k+15)+j*n];
- sum += lhs[i*n+k+16] * rhs_t[(k+16)+j*n];
- sum += lhs[i*n+k+17] * rhs_t[(k+17)+j*n];
- sum += lhs[i*n+k+18] * rhs_t[(k+18)+j*n];
- sum += lhs[i*n+k+19] * rhs_t[(k+19)+j*n];
- sum += lhs[i*n+k+20] * rhs_t[(k+20)+j*n];
- sum += lhs[i*n+k+21] * rhs_t[(k+21)+j*n];
- sum += lhs[i*n+k+22] * rhs_t[(k+22)+j*n];
- sum += lhs[i*n+k+23] * rhs_t[(k+23)+j*n];
- sum += lhs[i*n+k+24] * rhs_t[(k+24)+j*n];
- sum += lhs[i*n+k+25] * rhs_t[(k+25)+j*n];
- sum += lhs[i*n+k+26] * rhs_t[(k+26)+j*n];
- sum += lhs[i*n+k+27] * rhs_t[(k+27)+j*n];
- sum += lhs[i*n+k+28] * rhs_t[(k+28)+j*n];
- sum += lhs[i*n+k+29] * rhs_t[(k+29)+j*n];
- sum += lhs[i*n+k+30] * rhs_t[(k+30)+j*n];
- sum += lhs[i*n+k+31] * rhs_t[(k+31)+j*n];
- }
- for (; k < n; ++k)
- {
- sum += lhs[i*n+k] * rhs_t[k+j*n];
- }
- result[i*n+j] = sum;
- }
- }
- destroy_matrix(rhs_t);
- }
- void mm6(T* lhs, T* rhs, T* result, int n)
- {
- T* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = n; --i;)
- {
- for (int j = n; --j;)
- {
- T sum0 = 0;
- T sum1 = 0;
- T sum2 = 0;
- T sum3 = 0;
- int k = 0;
- for (; k < n - 7; k += 8)
- {
- sum0 += lhs[i*n+k] * rhs_t[k+j*n];
- sum1 += lhs[i*n+k+1] * rhs_t[(k+1)+j*n];
- sum2 += lhs[i*n+k+2] * rhs_t[(k+2)+j*n];
- sum3 += lhs[i*n+k+3] * rhs_t[(k+3)+j*n];
- sum0 += lhs[i*n+k+4] * rhs_t[(k+4)+j*n];
- sum1 += lhs[i*n+k+5] * rhs_t[(k+5)+j*n];
- sum2 += lhs[i*n+k+6] * rhs_t[(k+6)+j*n];
- sum3 += lhs[i*n+k+7] * rhs_t[(k+7)+j*n];
- }
- for (; k < n; ++k)
- {
- sum0 += lhs[i*n+k] * rhs_t[k+j*n];
- }
- result[i*n+j] = sum0 + sum1 + sum2 + sum3;
- }
- }
- destroy_matrix(rhs_t);
- }
- void mm7(T* lhs, T* rhs, T* result, int n)
- {
- T* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = 0; i < n; ++i)
- {
- int j = 0;
- for (; j < n - 7; j += 4)
- {
- T sum0 = 0;
- T sum1 = 0;
- T sum2 = 0;
- T sum3 = 0;
- int k = 0;
- for (; k < n - 7; k += 8)
- {
- const T a0 = lhs[i*n+k+0];
- const T a1 = lhs[i*n+k+1];
- const T a2 = lhs[i*n+k+2];
- const T a3 = lhs[i*n+k+3];
- const T a4 = lhs[i*n+k+4];
- const T a5 = lhs[i*n+k+5];
- const T a6 = lhs[i*n+k+6];
- const T a7 = lhs[i*n+k+7];
- sum0 += a0 * rhs_t[(k+0)+j*n];
- sum1 += a0 * rhs_t[(k+0)+(j+1)*n];
- sum2 += a0 * rhs_t[(k+0)+(j+2)*n];
- sum3 += a0 * rhs_t[(k+0)+(j+3)*n];
- sum0 += a1 * rhs_t[(k+1)+j*n];
- sum1 += a1 * rhs_t[(k+1)+(j+1)*n];
- sum2 += a1 * rhs_t[(k+1)+(j+2)*n];
- sum3 += a1 * rhs_t[(k+1)+(j+3)*n];
- sum0 += a2 * rhs_t[(k+2)+j*n];
- sum1 += a2 * rhs_t[(k+2)+(j+1)*n];
- sum2 += a2 * rhs_t[(k+2)+(j+2)*n];
- sum3 += a2 * rhs_t[(k+2)+(j+3)*n];
- sum0 += a3 * rhs_t[(k+3)+j*n];
- sum1 += a3 * rhs_t[(k+3)+(j+1)*n];
- sum2 += a3 * rhs_t[(k+3)+(j+2)*n];
- sum3 += a3 * rhs_t[(k+3)+(j+3)*n];
- sum0 += a4 * rhs_t[(k+4)+j*n];
- sum1 += a4 * rhs_t[(k+4)+(j+1)*n];
- sum2 += a4 * rhs_t[(k+4)+(j+2)*n];
- sum3 += a4 * rhs_t[(k+4)+(j+3)*n];
- sum0 += a5 * rhs_t[(k+5)+j*n];
- sum1 += a5 * rhs_t[(k+5)+(j+1)*n];
- sum2 += a5 * rhs_t[(k+5)+(j+2)*n];
- sum3 += a5 * rhs_t[(k+5)+(j+3)*n];
- sum0 += a6 * rhs_t[(k+6)+j*n];
- sum1 += a6 * rhs_t[(k+6)+(j+1)*n];
- sum2 += a6 * rhs_t[(k+6)+(j+2)*n];
- sum3 += a6 * rhs_t[(k+6)+(j+3)*n];
- sum0 += a7 * rhs_t[(k+7)+j*n];
- sum1 += a7 * rhs_t[(k+7)+(j+1)*n];
- sum2 += a7 * rhs_t[(k+7)+(j+2)*n];
- sum3 += a7 * rhs_t[(k+7)+(j+3)*n];
- }
- result[i*n+j] = sum0;
- result[i*n+j+1] = sum1;
- result[i*n+j+2] = sum2;
- result[i*n+j+3] = sum3;
- }
- }
- destroy_matrix(rhs_t);
- }
- void mm8_d(double* lhs, double* rhs, double* result, int n)
- {
- double* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = 0; i < n; ++i)
- {
- int j = 0;
- for (; j < n - 7; j += 8)
- {
- __m128d sum0 = _mm_setzero_pd();
- __m128d sum1 = _mm_setzero_pd();
- __m128d sum2 = _mm_setzero_pd();
- __m128d sum3 = _mm_setzero_pd();
- __m128d sum4 = _mm_setzero_pd();
- __m128d sum5 = _mm_setzero_pd();
- __m128d sum6 = _mm_setzero_pd();
- __m128d sum7 = _mm_setzero_pd();
- int k = 0;
- for (; k < n - 7; k += 8)
- {
- const __m128d a01 = _mm_load_pd(&lhs[i*n+k+0]);
- const __m128d a23 = _mm_load_pd(&lhs[i*n+k+2]);
- const __m128d a45 = _mm_load_pd(&lhs[i*n+k+4]);
- const __m128d a67 = _mm_load_pd(&lhs[i*n+k+6]);
- sum0 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+j*n]));
- sum0 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+j*n]));
- sum0 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+j*n]));
- sum0 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+j*n]));
- sum1 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+1)*n]));
- sum1 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+1)*n]));
- sum1 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+1)*n]));
- sum1 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+1)*n]));
- sum2 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+2)*n]));
- sum2 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+2)*n]));
- sum2 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+2)*n]));
- sum2 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+2)*n]));
- sum3 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+3)*n]));
- sum3 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+3)*n]));
- sum3 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+3)*n]));
- sum3 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+3)*n]));
- sum4 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+4)*n]));
- sum4 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+4)*n]));
- sum4 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+4)*n]));
- sum4 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+4)*n]));
- sum5 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+5)*n]));
- sum5 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+5)*n]));
- sum5 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+5)*n]));
- sum1 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+5)*n]));
- sum6 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+6)*n]));
- sum6 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+6)*n]));
- sum6 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+6)*n]));
- sum6 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+6)*n]));
- sum7 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+7)*n]));
- sum7 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+7)*n]));
- sum7 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+7)*n]));
- sum7 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+7)*n]));
- }
- _mm_store_pd(&result[i*n+j], _mm_hadd_pd(sum0, sum1));
- _mm_store_pd(&result[i*n+j+2], _mm_hadd_pd(sum2, sum3));
- _mm_store_pd(&result[i*n+j+4], _mm_hadd_pd(sum4, sum5));
- _mm_store_pd(&result[i*n+j+6], _mm_hadd_pd(sum6, sum7));
- }
- }
- destroy_matrix(rhs_t);
- }
- void mm8_f(float* lhs, float* rhs, float* result, int n)
- {
- float* rhs_t = make_matrix(n);
- transpose(rhs, rhs_t, n);
- for (int i = 0; i < n; ++i)
- {
- int j = 0;
- for (; j < n - 7; j += 8)
- {
- __m128 sum0 = _mm_setzero_ps();
- __m128 sum1 = _mm_setzero_ps();
- __m128 sum2 = _mm_setzero_ps();
- __m128 sum3 = _mm_setzero_ps();
- __m128 sum4 = _mm_setzero_ps();
- __m128 sum5 = _mm_setzero_ps();
- __m128 sum6 = _mm_setzero_ps();
- __m128 sum7 = _mm_setzero_ps();
- int k = 0;
- for (; k < n - 15; k += 16)
- {
- const __m128 a0123 = _mm_load_ps(&lhs[i*n+k+0]);
- const __m128 a4567 = _mm_load_ps(&lhs[i*n+k+4]);
- const __m128 b0123 = _mm_load_ps(&lhs[i*n+k+8]);
- const __m128 b4567 = _mm_load_ps(&lhs[i*n+k+12]);
- sum0 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+j*n]));
- sum0 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+j*n]));
- sum0 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+j*n]));
- sum0 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+j*n]));
- sum1 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+1)*n]));
- sum1 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+1)*n]));
- sum1 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+1)*n]));
- sum1 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+1)*n]));
- sum2 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+2)*n]));
- sum2 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+2)*n]));
- sum3 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+2)*n]));
- sum3 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+2)*n]));
- sum3 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+3)*n]));
- sum3 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+3)*n]));
- sum4 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+3)*n]));
- sum4 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+3)*n]));
- sum4 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+4)*n]));
- sum4 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+4)*n]));
- sum4 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+4)*n]));
- sum4 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+4)*n]));
- sum5 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+5)*n]));
- sum5 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+5)*n]));
- sum5 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+5)*n]));
- sum5 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+5)*n]));
- sum6 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+6)*n]));
- sum6 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+6)*n]));
- sum6 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+6)*n]));
- sum6 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+6)*n]));
- sum7 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+7)*n]));
- sum7 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+7)*n]));
- sum7 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+7)*n]));
- sum7 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+7)*n]));
- }
- __m128 t0 = _mm_hadd_ps(sum0, sum1);
- __m128 t1 = _mm_hadd_ps(sum2, sum3);
- __m128 t2 = _mm_hadd_ps(sum4, sum5);
- __m128 t3 = _mm_hadd_ps(sum6, sum7);
- _mm_store_ps(&result[i*n+j], _mm_hadd_ps(t0, t1));
- _mm_store_ps(&result[i*n+j+4], _mm_hadd_ps(t2, t3));
- }
- }
- destroy_matrix(rhs_t);
- }
- float time_it(void (*f)(T*, T*, T*, int), int n, float min_t)
- {
- T* lhs = make_matrix(n);
- T* rhs = make_matrix(n);
- T* result = make_matrix(n);
- clock_t start = clock();
- for(int i = 1;; ++i)
- {
- f(lhs, rhs, result, n);
- clock_t end = clock();
- float seconds = (float)(end - start) / CLOCKS_PER_SEC;
- if (seconds > min_t)
- {
- return seconds / i;
- }
- }
- destroy_matrix(lhs);
- destroy_matrix(rhs);
- destroy_matrix(result);
- }
- int main()
- {
- const int n = 512;
- const float min_t = 4.0f;
- /*
- printf("mm0: %f\n", time_it(mm0, n, min_t));
- printf("mm1: %f\n", time_it(mm1, n, min_t));
- printf("mm2: %f\n", time_it(mm2, n, min_t));
- printf("mm3: %f\n", time_it(mm3, n, min_t));
- printf("mm4: %f\n", time_it(mm4, n, min_t));
- printf("mm5: %f\n", time_it(mm5, n, min_t));
- printf("mm6: %f\n", time_it(mm6, n, min_t));
- printf("mm7: %f\n", time_it(mm7, n, min_t));
- printf("mm8: %f\n", time_it(mm8_f, n, min_t));
- */
- printf("mm8: %f\n", time_it(mm8, n, min_t));
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement