Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * Accumulate the product of A with its own transpose into C using SSE.
 *
 * Both matrices are addressed column-major with a column stride of m:
 *
 *     C[r + j*m] += A[r + k*m] * A[j + k*m]
 *
 * for every row r in [0, m), column j in [0, m) and k in [0, n).
 *
 * m  number of rows of A; also the row and column count of C
 * n  number of columns of A
 * A  input, m*n floats, read only
 * C  output, m*m floats, accumulated into (the caller chooses the
 *    starting contents; it is not zeroed here)
 *
 * No alignment is required (unaligned loads/stores are used). A and C are
 * expected not to overlap. Non-positive m or n leaves C untouched.
 */
void sgemm(int m, int n, float *A, float *C) {
    if (m <= 0 || n <= 0) {
        return;
    }

    // Rows [0, vec_rows) are handled four at a time; the rest one by one.
    const int vec_rows = m - m % 4;

    const float *col = A;  // start of column k of A
    for (int k = 0; k < n; k++, col += m) {
        // Vector part: keep four rows of the current column in a register
        // and reuse them for every column of C.
        for (int r = 0; r < vec_rows; r += 4) {
            const __m128 rows = _mm_loadu_ps(col + r);
            float *ccol = C;  // start of column j of C
            for (int j = 0; j < m; j++, ccol += m) {
                const __m128 scale = _mm_load1_ps(col + j);  // broadcast A[j + k*m]
                _mm_storeu_ps(ccol + r,
                              _mm_add_ps(_mm_loadu_ps(ccol + r),
                                         _mm_mul_ps(rows, scale)));
            }
        }

        // Scalar tail for the last m % 4 rows.
        for (int r = vec_rows; r < m; r++) {
            const float a_rk = col[r];
            float *ccol = C;
            for (int j = 0; j < m; j++, ccol += m) {
                ccol[r] += a_rk * col[j];
            }
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement