Advertisement
Guest User

Untitled

a guest
Sep 25th, 2017
197
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.21 KB | None | 0 0
  // Accumulates the product of A with its own transpose into C (C += A * A^T).
  //
  // Layout: both matrices are addressed column by column with a leading
  // dimension of m, i.e. element (r, k) of A is A[r + k*m] and element (r, j)
  // of C is C[r + j*m].
  //
  //   m  number of rows of A (and rows/columns of C); any value, not only
  //      multiples of 4 -- leftover rows are handled by a scalar tail loop.
  //   n  number of columns of A.
  //   A  m*n floats, read only.
  //   C  m*m floats, updated in place; the caller zeroes it first if a plain
  //      product (rather than an accumulation) is wanted.
  //
  // A and C must not overlap. No alignment is required: every vector access
  // uses the unaligned load/store forms. Non-positive m or n leaves C untouched.
  void sgemm(int m, int n, float *A, float *C) {
      for (int k = 0; k < n; k++) {
          // Column k of A; each of its entries scales the whole column once.
          const float *col = A + (long long)k * m;

          for (int j = 0; j < m; j++) {
              // Broadcast A(j, k) to all four lanes.
              __m128 scale = _mm_load1_ps(col + j);
              // Column j of C receives col * A(j, k).
              float *out = C + (long long)j * m;

              int r = 0;
              // Main body: four rows per step.
              for (; r + 4 <= m; r += 4) {
                  __m128 prod = _mm_mul_ps(_mm_loadu_ps(col + r), scale);
                  _mm_storeu_ps(out + r, _mm_add_ps(_mm_loadu_ps(out + r), prod));
              }
              // Tail: up to three remaining rows when m is not a multiple of 4.
              for (; r < m; r++) {
                  out[r] += col[r] * col[j];
              }
          }
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement