Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * sgemm - accumulate the product of A and its transpose into C:
 *         C += A * A^T (single precision, SSE).
 *
 * m: number of rows of A; also the order of the square output C.
 * n: number of columns of A.
 * A: m-by-n input stored column-major, i.e. column k occupies the m
 *    contiguous floats starting at A + k*m. Only read.
 * C: m-by-m output stored column-major (column j starts at C + j*m).
 *    Updated in place, so the caller must initialise it (e.g. to zero).
 *
 * A and C are expected not to overlap. Unaligned loads and stores are used,
 * so neither pointer needs 16-byte alignment. Nothing happens when m <= 0
 * or n <= 0.
 *
 * Note: the stride between columns is m (the column length), not n; the two
 * only coincide for square inputs. For each element of C the contributions
 * are added in increasing column order of A.
 */
void sgemm(int m, int n, float *A, float *C) {
    if (m <= 0 || n <= 0) {
        return;
    }

    // Rows covered by whole 4-float SSE vectors; any leftover rows (m % 4)
    // are handled by the scalar tail loop below.
    const int simd_rows = m - m % 4;

    // Walk the columns of A by pointer increment so no index product
    // (k * m) is ever formed in int arithmetic.
    const float *a_col = A;
    for (int k = 0; k < n; k++, a_col += m) {
        float *c_col = C;
        for (int j = 0; j < m; j++, c_col += m) {
            // Column j of C receives column k of A scaled by A(j, k).
            const float a_jk = a_col[j];
            const __m128 a_jk_x4 = _mm_load1_ps(a_col + j);

            int r = 0;
            for (; r < simd_rows; r += 4) {
                const __m128 prod = _mm_mul_ps(_mm_loadu_ps(a_col + r), a_jk_x4);
                _mm_storeu_ps(c_col + r, _mm_add_ps(_mm_loadu_ps(c_col + r), prod));
            }
            for (; r < m; r++) {
                c_col[r] += a_col[r] * a_jk;
            }
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement