Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- 68,523 0 0 0 0 0 0 0 0 void mm_48_48_48(int n, double * __restrict__ a, double * __restrict__ b, double * __restrict__ c, int first) {
- . . . . . . . . .
- . . . . . . . . . static double packed_A[48*48];
- . . . . . . . . . int pack_A_indx = 0;
- 137,046 0 0 0 0 0 0 0 0 if (first) {
- 1,140,051 0 0 0 0 0 0 0 0 for (int k = 0; k < 48; k++) {
- 17,647,755 23 11 6,804,864 1,731,584 978,797 6,804,864 1,070,545 8,092 memcpy(&packed_A[pack_A_indx], &a[k*n], sizeof(double) * 48); pack_A_indx += 48;
- . . . . . . . . . }}
- 13,293,462 0 0 0 0 0 0 0 0 for (int i = 0; i < 48; i++) {
- 3,289,104 14 8 3,289,104 3,289,104 90,058 0 0 0 __m256d c0 = _mm256_load_pd((const double *)&c[i*n+0]);
- 3,289,104 0 0 3,289,104 362,280 12,005 0 0 0 __m256d c1 = _mm256_load_pd((const double *)&c[i*n+4]);
- 3,289,104 0 0 3,289,104 2,926,824 89,371 0 0 0 __m256d c2 = _mm256_load_pd((const double *)&c[i*n+8]);
- 3,289,104 0 0 3,289,104 362,280 12,090 0 0 0 __m256d c3 = _mm256_load_pd((const double *)&c[i*n+12]);
- 3,289,104 0 0 3,289,104 2,926,824 90,003 0 0 0 __m256d c4 = _mm256_load_pd((const double *)&c[i*n+16]);
- 3,289,104 0 0 3,289,104 362,280 12,108 0 0 0 __m256d c5 = _mm256_load_pd((const double *)&c[i*n+20]);
- 3,289,104 0 0 3,289,104 2,926,824 90,028 0 0 0 __m256d c6 = _mm256_load_pd((const double *)&c[i*n+24]);
- 3,289,104 0 0 3,289,104 362,280 12,126 0 0 0 __m256d c7 = _mm256_load_pd((const double *)&c[i*n+28]);
- 3,289,104 12 5 3,289,104 2,926,824 90,521 0 0 0 __m256d c8 = _mm256_load_pd((const double *)&c[i*n+32]);
- 3,289,104 0 0 3,289,104 362,280 12,201 0 0 0 __m256d c9 = _mm256_load_pd((const double *)&c[i*n+36]);
- 3,289,104 0 0 3,289,104 2,926,824 90,503 0 0 0 __m256d c10 = _mm256_load_pd((const double *)&c[i*n+40]);
- 3,289,104 0 0 3,289,104 362,280 12,263 0 0 0 __m256d c11 = _mm256_load_pd((const double *)&c[i*n+44]);
- 638,086,176 0 0 0 0 0 0 0 0 for (int k = 0; k < 48; k++) {
- 161,166,096 0 0 157,876,992 20,096,904 847,088 0 0 0 __m256d a1 = _mm256_set1_pd(b[i*n+k]);
- 315,753,984 0 0 157,876,992 5,529 0 0 0 0 c0 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+0])), c0);
- 315,753,984 0 0 157,876,992 58,386 0 0 0 0 c1 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+4])), c1);
- 315,753,984 22 9 157,876,992 0 0 0 0 0 c2 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+8])), c2);
- 315,753,984 0 0 157,876,992 59,174 0 0 0 0 c3 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+12])), c3);
- 315,753,984 0 0 157,876,992 0 0 0 0 0 c4 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+16])), c4);
- 315,753,984 15 6 157,876,992 70,557 0 0 0 0 c5 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+20])), c5);
- 315,753,984 0 0 157,876,992 0 0 0 0 0 c6 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+24])), c6);
- 315,753,984 0 0 157,876,992 81,428 0 0 0 0 c7 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+28])), c7);
- 315,753,984 0 0 157,876,992 0 0 0 0 0 c8 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+32])), c8);
- 315,753,984 0 0 157,876,992 81,354 0 0 0 0 c9 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+36])), c9);
- 315,753,984 0 0 157,876,992 0 0 0 0 0 c10 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+40])), c10);
- 315,753,984 16 6 157,876,992 71,160 0 0 0 0 c11 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+44])), c11);
- . . . . . . . . . }
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+0], c0);
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+4], c1);
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+8], c2);
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+12], c3);
- 3,289,104 16 6 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+16], c4);
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+20], c5);
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+24], c6);
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+28], c7);
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+32], c8);
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+36], c9);
- 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+40], c10);
- 3,289,104 14 6 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+44], c11);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement