Advertisement
Guest User

Untitled

a guest
Sep 25th, 2017
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.19 KB | None | 0 0
  /*
   * sgemm - accumulate a 64x64 block of self outer products into C.
   *
   * For every i in [0, n) the 64 consecutive floats starting at A + i*n are
   * treated as one vector v (the original author's comment calls it a
   * "column"), and the update
   *
   *     C[j*n + k] += v[k] * v[j]        for 0 <= j < 64, 0 <= k < 64
   *
   * is applied four floats at a time with unaligned SSE loads and stores.
   *
   * m: not used by this routine; the block edge is fixed at 64.
   *    NOTE(review): presumably m is the matrix dimension that the hard-coded
   *    64 stands in for -- confirm against the callers.
   * n: number of 64-float vectors consumed from A, and also the element
   *    stride between consecutive vectors of A and between consecutive
   *    64-float runs of C.
   * A: input; floats A[i*n] .. A[i*n + 63] are read for every i in [0, n).
   * C: accumulator, updated in place (it is not cleared here); floats
   *    C[j*n] .. C[j*n + 63] are read and written for every j in [0, 64).
   *
   * Nothing is read or written when n <= 0.
   */
  void sgemm(int m, int n, float *A, float *C) {
      enum { BLOCK = 64, LANES = 4, VECS = BLOCK / LANES };

      /*
       * Offsets are computed in long long rather than int: i * n overflows a
       * 32-bit int (undefined behaviour) once n grows past roughly 46,000.
       */
      const long long stride = n;

      (void)m; /* the 64-wide block size is hard-coded; m is never consulted */

      for (int i = 0; i < n; i++) {
          const float *src = A + i * stride;
          __m128 col[VECS];

          /* Keep the whole 64-float vector in registers for the inner loops. */
          for (int v = 0; v < VECS; v++) {
              col[v] = _mm_loadu_ps(src + v * LANES);
          }

          for (int j = 0; j < BLOCK; j++) {
              /*
               * Broadcast element j of the current vector.  It is re-read from
               * memory (not taken from col[]) so that the result is unchanged
               * even if C happens to overlap A.
               */
              const __m128 scale = _mm_load1_ps(src + j);
              float *dst = C + j * stride;

              /* dst[0..63] += col[0..63] * scale, in ascending address order. */
              for (int v = 0; v < VECS; v++) {
                  float *p = dst + v * LANES;
                  _mm_storeu_ps(p, _mm_add_ps(_mm_loadu_ps(p),
                                              _mm_mul_ps(col[v], scale)));
              }
          }
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement