Advertisement
Guest User

Untitled

a guest
Feb 8th, 2016
51
0
Never
Not a member of Pastebin yet? Sign Up — it unlocks many cool features!
text 7.13 KB | None | 0 0
  1. 68,523 0 0 0 0 0 0 0 0 void mm_48_48_48(int n, double * __restrict__ a, double * __restrict__ b, double * __restrict__ c, int first) {
  2. . . . . . . . . .
  3. . . . . . . . . . static double packed_A[48*48];
  4. . . . . . . . . . int pack_A_indx = 0;
  5. 137,046 0 0 0 0 0 0 0 0 if (first) {
  6. 1,140,051 0 0 0 0 0 0 0 0 for (int k = 0; k < 48; k++) {
  7. 17,647,755 23 11 6,804,864 1,731,584 978,797 6,804,864 1,070,545 8,092 memcpy(&packed_A[pack_A_indx], &a[k*n], sizeof(double) * 48); pack_A_indx += 48;
  8. . . . . . . . . . }}
  9. 13,293,462 0 0 0 0 0 0 0 0 for (int i = 0; i < 48; i++) {
  10. 3,289,104 14 8 3,289,104 3,289,104 90,058 0 0 0 __m256d c0 = _mm256_load_pd((const double *)&c[i*n+0]);
  11. 3,289,104 0 0 3,289,104 362,280 12,005 0 0 0 __m256d c1 = _mm256_load_pd((const double *)&c[i*n+4]);
  12. 3,289,104 0 0 3,289,104 2,926,824 89,371 0 0 0 __m256d c2 = _mm256_load_pd((const double *)&c[i*n+8]);
  13. 3,289,104 0 0 3,289,104 362,280 12,090 0 0 0 __m256d c3 = _mm256_load_pd((const double *)&c[i*n+12]);
  14. 3,289,104 0 0 3,289,104 2,926,824 90,003 0 0 0 __m256d c4 = _mm256_load_pd((const double *)&c[i*n+16]);
  15. 3,289,104 0 0 3,289,104 362,280 12,108 0 0 0 __m256d c5 = _mm256_load_pd((const double *)&c[i*n+20]);
  16. 3,289,104 0 0 3,289,104 2,926,824 90,028 0 0 0 __m256d c6 = _mm256_load_pd((const double *)&c[i*n+24]);
  17. 3,289,104 0 0 3,289,104 362,280 12,126 0 0 0 __m256d c7 = _mm256_load_pd((const double *)&c[i*n+28]);
  18. 3,289,104 12 5 3,289,104 2,926,824 90,521 0 0 0 __m256d c8 = _mm256_load_pd((const double *)&c[i*n+32]);
  19. 3,289,104 0 0 3,289,104 362,280 12,201 0 0 0 __m256d c9 = _mm256_load_pd((const double *)&c[i*n+36]);
  20. 3,289,104 0 0 3,289,104 2,926,824 90,503 0 0 0 __m256d c10 = _mm256_load_pd((const double *)&c[i*n+40]);
  21. 3,289,104 0 0 3,289,104 362,280 12,263 0 0 0 __m256d c11 = _mm256_load_pd((const double *)&c[i*n+44]);
  22. 638,086,176 0 0 0 0 0 0 0 0 for (int k = 0; k < 48; k++) {
  23. 161,166,096 0 0 157,876,992 20,096,904 847,088 0 0 0 __m256d a1 = _mm256_set1_pd(b[i*n+k]);
  24. 315,753,984 0 0 157,876,992 5,529 0 0 0 0 c0 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+0])), c0);
  25. 315,753,984 0 0 157,876,992 58,386 0 0 0 0 c1 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+4])), c1);
  26. 315,753,984 22 9 157,876,992 0 0 0 0 0 c2 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+8])), c2);
  27. 315,753,984 0 0 157,876,992 59,174 0 0 0 0 c3 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+12])), c3);
  28. 315,753,984 0 0 157,876,992 0 0 0 0 0 c4 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+16])), c4);
  29. 315,753,984 15 6 157,876,992 70,557 0 0 0 0 c5 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+20])), c5);
  30. 315,753,984 0 0 157,876,992 0 0 0 0 0 c6 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+24])), c6);
  31. 315,753,984 0 0 157,876,992 81,428 0 0 0 0 c7 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+28])), c7);
  32. 315,753,984 0 0 157,876,992 0 0 0 0 0 c8 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+32])), c8);
  33. 315,753,984 0 0 157,876,992 81,354 0 0 0 0 c9 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+36])), c9);
  34. 315,753,984 0 0 157,876,992 0 0 0 0 0 c10 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+40])), c10);
  35. 315,753,984 16 6 157,876,992 71,160 0 0 0 0 c11 = _mm256_add_pd(_mm256_mul_pd(a1, _mm256_load_pd((const double *)&packed_A[k*48+44])), c11);
  36. . . . . . . . . . }
  37. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+0], c0);
  38. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+4], c1);
  39. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+8], c2);
  40. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+12], c3);
  41. 3,289,104 16 6 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+16], c4);
  42. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+20], c5);
  43. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+24], c6);
  44. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+28], c7);
  45. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+32], c8);
  46. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+36], c9);
  47. 3,289,104 0 0 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+40], c10);
  48. 3,289,104 14 6 0 0 0 3,289,104 0 0 _mm256_store_pd(&c[i*n+44], c11);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement