Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <emmintrin.h>
- #include <time.h>
- #include <math.h>
- double now()
- {
- return (double)clock() / (double)CLOCKS_PER_SEC;
- }
- void transpose(float *M, int N)
- {
- int i, j;
- for(i = 0; i < N; i++)
- for(j = i + 1; j < N; j++)
- {
- float tmp = M[i*N + j];
- M[i*N + j] = M[j*N + i];
- M[j*N + i] = tmp;
- }
- }
- int main(void)
- {
- double t0, t1, t2;
- float *A, *B, *C, *Correct;
- float *ptrA, *ptrB;
- int N = 1000;
- int i, j, k, n, mode;
- t0 = now();
- A = (float*)_aligned_malloc(N*N*sizeof(float), 16);
- B = (float*)_aligned_malloc(N*N*sizeof(float), 16);
- C = (float*)_aligned_malloc(N*N*sizeof(float), 16);
- Correct = NULL;
- for(i = 0; i < N; i++)
- for(j = 0; j < N; j++)
- {
- A[i*N + j] = (float)(rand() % 1000) / 1000.0f;
- B[i*N + j] = (float)(rand() % 1000) / 1000.0f;
- }
- printf("%dx%d matricies; element size: %d byte(s)\n", N, N, sizeof(float));
- for(mode = 0; mode <= 2; mode++)
- {
- for(n = 0; n < 1; n++)
- {
- printf("Mode: %d; threads: %d\n", mode, n);
- memset(C, 0, N*N*sizeof(float));
- t1 = now() - t0;
- switch(mode)
- {
- case 0:
- for(i = 0; i < N; i++)
- for(j = 0; j < N; j++)
- {
- C[i*N + j] = 0.0f;
- for(k = 0; k < N; k++)
- C[i*N + j] += A[i*N + k]*B[k*N + j];
- }
- break;
- case 1:
- transpose(B, N);
- for(i = 0; i < N; i++)
- for(j = 0; j < N; j++)
- {
- float tmp;
- tmp = 0.0f;
- ptrA = A + i*N;
- ptrB = B + j*N;
- for(k = 0; k < N; k++, ptrA++, ptrB++)
- tmp = tmp + (*ptrA) * (*ptrB);
- C[i*N + j] = tmp;
- }
- transpose(B, N);
- break;
- case 2:
- transpose(B, N);
- for(i = 0; i < N; i++)
- for(j = 0; j < N; j++)
- {
- __m128 tmp;
- tmp = _mm_set1_ps(0.0f);
- ptrA = A + i*N;
- ptrB = B + j*N;
- for(k = 0; k < N/4; k++, ptrA += 4, ptrB += 4)
- tmp = _mm_add_ps(tmp,
- _mm_mul_ps(
- _mm_load_ps(ptrA),
- _mm_load_ps(ptrB)));
- C[i*N + j] = 0.0f;
- for(k = 0; k < 4; k++)
- C[i*N + j] += tmp.m128_f32[k];
- }
- transpose(B, N);
- break;
- }
- t2 = now() - t0;
- printf("Time elapsed: %.3lf ms\n", (t2 - t1) * 1000.0);
- if(!Correct)
- {
- Correct = (float*)_aligned_malloc(N*N*sizeof(float), 16);
- memcpy(Correct, C, N*N*sizeof(float));
- }
- else
- {
- for(i = 0; i < N; i++)
- for(j = 0; j < N; j++)
- if(fabs((double)C[i*N + j] - (double)Correct[i*N + j]) > 1e-2f)
- {
- printf("Wrong result: C[%d,%d] = %lg =/= %lg\n",
- i, j, (double)C[i*N + j], (double)Correct[i*N + j]);
- i = j = N;
- }
- }
- }
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment