Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // matrix.cpp : main project file.
- #include "stdafx.h"
- #include <conio.h>
- #include <Windows.h>
- #include <tchar.h>
- #include <time.h>
- #include <cmath>
- #include <intrin.h>
- #include <immintrin.h>
- #define QPF(f) QueryPerformanceFrequency(f)
- #define QPC(f) QueryPerformanceCounter(f)
- #define eps 0.5f
- typedef unsigned __int64 uint64;
- using namespace System;
- using namespace std;
- LARGE_INTEGER frequency, st, f;
- uint64 start, finish = 0xFFFFFFFF;
- double sec;
- static void StartTimer() {
- QPC(&st);
- start = st.QuadPart;
- }
- static void EndTimer(LONGLONG fr, double *sec) {
- QPC(&f);
- finish = f.QuadPart;
- *sec = (finish - start) / (double)fr;
- }
- class Matrix {
- public:
- static void randomize(float* a, size_t n) {
- for (size_t i = 0; i < n; i++) {
- for (size_t j = 0; j < n; j++) {
- a[i * n + j] = (float)(rand() % 10) + 1.0f;
- }
- }
- }
- static void print(float* a, size_t n) {
- for (size_t i = 0; i < n; i++) {
- for (size_t j = 0; j < n; j++) {
- _tprintf(_T("%f "), a[i * n + j]);
- }
- _tprintf(_T("\n"));
- }
- _tprintf(_T("-----------\n\n"));
- }
- static void swap(float* a, size_t n, size_t r1, size_t r2) {
- float tmp;
- for (size_t j = 0; j < n; j++) {
- tmp = a[n * r1 + j];
- a[n * r1 + j] = a[n * r2 + j];
- a[n * r2 + j] = tmp;
- }
- }
- static void add(float* a, float* b, float* c, size_t n) {
- for (size_t i = 0; i < n; i++) {
- for (size_t j = 0; j < n; j++) {
- c[i * n + j] = a[i * n + j] + b[i * n + j];
- }
- }
- }
- static void sub(float* a, float* b, float* c, size_t n) {
- for (size_t i = 0; i < n; i++) {
- for (size_t j = 0; j < n; j++) {
- c[i * n + j] = a[i * n + j] - b[i * n + j];
- }
- }
- }
- static void SSEadd(float* a, float* b, float* c, size_t n) {
- __m128* pa = (__m128*) a;
- __m128* pb = (__m128*) b;
- __m128* pc = (__m128*) c;
- for (size_t i = 0; i < n * n / 4; i += 4) {
- pc[i] = _mm_add_ps(pa[i], pb[i]);
- pc[i + 1] = _mm_add_ps(pa[i + 1], pb[i + 1]);
- pc[i + 2] = _mm_add_ps(pa[i + 2], pb[i + 2]);
- pc[i + 3] = _mm_add_ps(pa[i + 3], pb[i + 3]);
- }
- }
- /*static void AVXadd(float* a, float* b, float* c, size_t n) {
- __m256* pa = (__m256*) a;
- __m256* pb = (__m256*) b;
- __m256* pc = (__m256*) c;
- for (size_t i = 0; i < n * n / 8; i ++) {
- pc[i] = _mm256_add_ps(pa[i], pb[i]);
- }
- }*/
- static void SSEsub(float* a, float* b, float* c, size_t n) {
- __m128* pa = (__m128*) a;
- __m128* pb = (__m128*) b;
- __m128* pc = (__m128*) c;
- for (size_t i = 0; i < n * n / 4; i += 4) {
- pc[i] = _mm_sub_ps(pa[i], pb[i]);
- pc[i + 1] = _mm_sub_ps(pa[i + 1], pb[i + 1]);
- pc[i + 2] = _mm_sub_ps(pa[i + 2], pb[i + 2]);
- pc[i + 3] = _mm_sub_ps(pa[i + 3], pb[i + 3]);
- }
- }
- static void mul(float* a, float* b, float* c, size_t n) {
- for (size_t i = 0; i < n; i++) {
- for (size_t j = 0; j < n; j++) {
- for (size_t k = 0; k < n; k++) {
- c[i * n + k] += a[i * n + j] * b[j * n + k];
- }
- }
- }
- }
- static void SSEmul(float* a, float* b, float* c, size_t n) {
- __m128 *pb = (__m128*)b;
- __m128 *pc = (__m128*)c;
- size_t n4 = n / 4;
- for (size_t i = 0; i < n; i++) {
- for (size_t j = 0; j < n; j++) {
- __m128 temp = _mm_set1_ps(a[i * n + j]);
- for (size_t k = 0; k < n4; k++) {
- pc[i * n4 + k] = _mm_add_ps(pc[i * n4 + k], _mm_mul_ps(temp, pb[j * n4 + k]));
- }
- }
- }
- }
- static void strassen(float* a, float* b, float* c, size_t n) {
- size_t half = n / 2;
- if (n <= 512) {
- mul(a, b, c, n);
- return;
- }
- float* A11 = new float[half * half];
- float* A12 = new float[half * half];
- float* A21 = new float[half * half];
- float* A22 = new float[half * half];
- float* B11 = new float[half * half];
- float* B12 = new float[half * half];
- float* B21 = new float[half * half];
- float* B22 = new float[half * half];
- float* C11 = new float[half * half];
- float* C12 = new float[half * half];
- float* C21 = new float[half * half];
- float* C22 = new float[half * half];
- float* P1 = new float[half * half];
- float* P2 = new float[half * half];
- float* P3 = new float[half * half];
- float* P4 = new float[half * half];
- float* P5 = new float[half * half];
- float* P6 = new float[half * half];
- float* P7 = new float[half * half];
- float* ARes = new float[half * half];
- float* BRes = new float[half * half];
- for (size_t i = 0; i < half; i++) {
- for (size_t j = 0; j < half; j++) {
- A11[i * half + j] = a[i * n + j];
- A12[i * half + j] = a[i * n + j + half];
- A21[i * half + j] = a[(i + half) * n + j];
- A22[i * half + j] = a[(i + half) * n + j + half];
- B11[i * half + j] = b[i * n + j];
- B12[i * half + j] = b[i * n + j + half];
- B21[i * half + j] = b[(i + half) * n + j];
- B22[i * half + j] = b[(i + half) * n + j + half];
- }
- }
- //P1
- add(A11, A22, ARes, half);
- add(B11, B22, BRes, half);
- strassen(ARes, BRes, P1, half);
- //P2
- add(A21, A22, ARes, half);
- strassen(ARes, B11, P2, half);
- //P3
- sub(B12, B22, BRes, half);
- strassen(A11, BRes, P3, half);
- //P4
- sub(B21, B11, BRes, half);
- strassen(A22, BRes, P4, half);
- //P5
- add(A11, A12, ARes, half);
- strassen(ARes, B22, P5, half);
- //P6
- sub(A21, A11, ARes, half);
- add(B11, B12, BRes, half);
- strassen(ARes, BRes, P6, half);
- delete[] A21;
- delete[] A11;
- delete[] B11;
- delete[] B12;
- //P7
- sub(A12, A22, ARes, half);
- add(B21, B22, BRes, half);
- strassen(ARes, BRes, P7, half);
- delete[] A12;
- delete[] A22;
- delete[] B21;
- delete[] B22;
- //C11
- add(P1, P4, ARes, half);
- sub(P7, P5, BRes, half);
- add(ARes, BRes, C11, half);
- delete[] P7;
- //C12
- add(P3, P5, C12, half);
- delete[] P5;
- //C21
- add(P2, P4, C21, half);
- delete[] P4;
- //C22
- add(P1, P3, ARes, half);
- sub(P6, P2, BRes, half);
- add(ARes, BRes, C22, half);
- delete[] P1;
- delete[] P3;
- delete[] P6;
- delete[] P2;
- for (size_t i = 0; i < half; i++) {
- for (size_t j = 0; j < half; j++) {
- c[i * n + j] = C11[i * half + j];
- c[i * n + j + half] = C12[i * half + j];
- c[(i + half) * n + j] = C21[i * half + j];
- c[(i + half) * n + j + half] = C22[i * half + j];
- }
- }
- }
- static void SSEstrassen(float* a, float* b, float* c, size_t n) {
- size_t half = n / 2;
- if (n <= 512) {
- SSEmul(a, b, c, n);
- return;
- }
- float* A11 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* A12 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* A21 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* A22 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* B11 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* B12 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* B21 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* B22 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* C11 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* C12 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* C21 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* C22 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* P1 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* P2 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* P3 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* P4 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* P5 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* P6 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* P7 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* ARes = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- float* BRes = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
- for (size_t i = 0; i < half; i++) {
- for (size_t j = 0; j < half; j++) {
- A11[i * half + j] = a[i * n + j];
- A12[i * half + j] = a[i * n + j + half];
- A21[i * half + j] = a[(i + half) * n + j];
- A22[i * half + j] = a[(i + half) * n + j + half];
- B11[i * half + j] = b[i * n + j];
- B12[i * half + j] = b[i * n + j + half];
- B21[i * half + j] = b[(i + half) * n + j];
- B22[i * half + j] = b[(i + half) * n + j + half];
- }
- }
- //P1
- SSEadd(A11, A22, ARes, half);
- SSEadd(B11, B22, BRes, half);
- SSEstrassen(ARes, BRes, P1, half);
- //P2
- SSEadd(A21, A22, ARes, half);
- SSEstrassen(ARes, B11, P2, half);
- //P3
- SSEsub(B12, B22, BRes, half);
- SSEstrassen(A11, BRes, P3, half);
- //P4
- SSEsub(B21, B11, BRes, half);
- SSEstrassen(A22, BRes, P4, half);
- //P5
- SSEadd(A11, A12, ARes, half);
- SSEstrassen(ARes, B22, P5, half);
- //P6
- SSEsub(A21, A11, ARes, half);
- SSEadd(B11, B12, BRes, half);
- SSEstrassen(ARes, BRes, P6, half);
- //P7
- SSEsub(A12, A22, ARes, half);
- SSEadd(B21, B22, BRes, half);
- SSEstrassen(ARes, BRes, P7, half);
- //C11
- SSEadd(P1, P4, ARes, half);
- SSEsub(P7, P5, BRes, half);
- SSEadd(ARes, BRes, C11, half);
- //C12
- SSEadd(P3, P5, C12, half);
- //C21
- SSEadd(P2, P4, C21, half);
- //C22
- SSEadd(P1, P3, ARes, half);
- SSEsub(P6, P2, BRes, half);
- SSEadd(ARes, BRes, C22, half);
- for (size_t i = 0; i < half; i++) {
- for (size_t j = 0; j < half; j++) {
- c[i * n + j] = C11[i * half + j];
- c[i * n + j + half] = C12[i * half + j];
- c[(i + half) * n + j] = C21[i * half + j];
- c[(i + half) * n + j + half] = C22[i * half + j];
- }
- }
- }
- static void invert(float *A, float* E, int N)
- {
- float temp;
- for (int i = 0; i < N; i++) {
- for (int j = 0; j < N; j++) {
- E[i * N + j] = 0.0f;
- if (i == j)
- E[i * N + j] = 1.0f;
- }
- }
- //added swap
- for (int k = 0; k < N; k++) {
- temp = A[k * N + k];
- if (abs(A[k * N + k] - eps) < 0) {
- int t = 0;
- for (int i = 0; i > N; i++) {
- if (abs(A[k * N + k] - eps) < 0) {
- swap(A, N, i, k);
- t = 1;
- }
- }
- if (t == 0) {
- _tprintf(_T("matrix cannot be inverted"));
- return;
- }
- }
- for (int j = 0; j < N; j++) {
- A[k * N + j] /= temp;
- E[k * N + j] /= temp;
- }
- for (int i = k + 1; i < N; i++) {
- temp = A[i * N + k];
- for (int j = 0; j < N; j++) {
- A[i * N + j] -= A[k * N + j] * temp;
- E[i * N + j] -= E[k * N + j] * temp;
- }
- }
- }
- for (int k = N - 1; k > 0; k--) {
- for (int i = k - 1; i >= 0; i--) {
- temp = A[i * N + k];
- for (int j = 0; j < N; j++) {
- A[i * N + j] -= A[k * N + j] * temp;
- E[i * N + j] -= E[k * N + j] * temp;
- }
- }
- }
- }
- static void SSEinvert(float *A, float* E, int N)
- {
- float tmp;
- memset(E, 0, N * N * sizeof(float));
- for (int i = 0; i < N; i++) {
- for (int j = 0; j < N; j++) {
- if (i == j)
- E[i * N + j] = 1.0f;
- }
- }
- __m128* pA = (__m128*) A;
- __m128* pE = (__m128*) E;
- for (int k = 0; k < N; k++) {
- __m128 temp = _mm_set1_ps(A[k * N + k]);
- for (int j = 0; j < N; j++) {
- pA[k * N + j] = _mm_div_ps(pA[k * N + j], temp);
- pE[k * N + j] = _mm_div_ps(pE[k * N + j], temp);
- pA[k * N + j + 1] = _mm_div_ps(pA[k * N + j + 1], temp);
- pE[k * N + j + 1] = _mm_div_ps(pE[k * N + j + 1], temp);
- pA[k * N + j + 2] = _mm_div_ps(pA[k * N + j + 2], temp);
- pE[k * N + j + 2] = _mm_div_ps(pE[k * N + j + 2], temp);
- pA[k * N + j + 3] = _mm_div_ps(pA[k * N + j + 3], temp);
- pE[k * N + j + 3] = _mm_div_ps(pE[k * N + j + 3], temp);
- }
- for (int i = k + 1; i < N; i++) {
- tmp = A[i * N + k];
- for (int j = 0; j < N; j++) {
- A[i * N + j] -= A[k * N + j] * tmp;
- E[i * N + j] -= E[k * N + j] * tmp;
- }
- }
- }
- for (int k = N - 1; k > 0; k--) {
- for (int i = k - 1; i >= 0; i--) {
- tmp = A[i * N + k];
- for (int j = 0; j < N; j++) {
- A[i * N + j] -= A[k * N + j] * tmp;
- E[i * N + j] -= E[k * N + j] * tmp;
- }
- }
- }
- }
- static void div(float *a, float *b, float *c, int n) {
- float* d = new float[n * n];
- float* e = new float[n * n];
- for (int i = 0; i < n; i++) {
- for (int j = 0; j < n; j++) {
- d[i * n + j] = b[i * n + j];
- }
- }
- invert(d, e, n);
- mul(a, d, c, n);
- }
- static void divStrassen(float *a, float *b, float *c, int n) {
- float* d = new float[n * n];
- float* e = new float[n * n];
- for (int i = 0; i < n; i++) {
- for (int j = 0; j < n; j++) {
- d[i * n + j] = b[i * n + j];
- }
- }
- invert(d, e, n);
- strassen(a, d, c, n);
- }
- static void SSEdiv(float *a, float *b, float *c, int n) {
- float* d = (float*)_aligned_malloc(sizeof(float) * n * n, 16);
- float* e = (float*)_aligned_malloc(sizeof(float) * n * n, 16);
- for (int i = 0; i < n; i++) {
- for (int j = 0; j < n; j++) {
- d[i * n + j] = b[i * n + j];
- }
- }
- SSEinvert(d, e, n);
- SSEstrassen(a, d, c, n);
- }
- static bool cmp(float *a, float *b, size_t n, float epsilon) {
- for (size_t i = 0; i < n; i++) {
- for (size_t j = 0; j < n; j++) {
- if (abs(a[i * n + j] - b[i * n + j]) > epsilon) {
- _tprintf(_T(" [%f != %f] "), a[i * n + j], b[i * n + j]);
- return false;
- }
- }
- }
- return true;
- }
- };
- static void TestAddSub(LONGLONG fr) {
- for (size_t i = 32, k = 30; i <= 512; i <<= 1) {
- double addtime = 0xFFFFFFFF;
- double subtime = 0xFFFFFFFF;
- float* a = new float[i * i];
- float* b = new float[i * i];
- float* c = new float[i * i];
- float* d = new float[i * i];
- for (size_t j = 0; j < k; j++) {
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::add(a, b, c, i);
- EndTimer(fr, &sec);
- addtime = sec < addtime ? sec : addtime;
- StartTimer();
- Matrix::sub(c, b, d, i);
- EndTimer(fr, &sec);
- subtime = sec < subtime ? sec : subtime;
- if (!Matrix::cmp(a, d, i, eps)) {
- _tprintf(_T("\n FAILED TO ADD/SUB MATRICES"));
- }
- }
- _tprintf(_T("\n add [%d]:\t%f"), i, addtime);
- _tprintf(_T("\n sub [%d]:\t%f\n"), i, subtime);
- }
- for (size_t i = 1024; i <= 2048; i <<= 1) {
- float* a = new float[i * i];
- float* b = new float[i * i];
- float* c = new float[i * i];
- float* d = new float[i * i];
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::add(a, b, c, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n add [%d]:\t%f"), i, sec);
- StartTimer();
- Matrix::sub(c, b, d, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n sub [%d]:\t%f\n"), i, sec);
- if (!Matrix::cmp(a, d, i, eps)) {
- _tprintf(_T("\n FAILED TO ADD/SUB MATRICES"));
- }
- }
- }
- static void TestMulStr(LONGLONG fr) {
- for (size_t i = 32, k = 30; i <= 256; i <<= 1) {
- double multime = 0xFFFFFFFF;
- double strtime = 0xFFFFFFFF;
- float* a = new float[i * i];
- float* b = new float[i * i];
- float* c = new float[i * i];
- float* d = new float[i * i];
- for (size_t j = 0; j < k; j++) {
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::mul(a, b, c, i);
- EndTimer(fr, &sec);
- multime = sec < multime ? sec : multime;
- StartTimer();
- Matrix::strassen(a, b, d, i);
- EndTimer(fr, &sec);
- strtime = sec < strtime ? sec : strtime;
- if (!Matrix::cmp(c, d, i, eps)) {
- _tprintf(_T("\n FAILED TO MUL MATRICES"));
- }
- }
- _tprintf(_T("\n mul [%d]:\t%f"), i, multime);
- _tprintf(_T("\n str [%d]:\t%f\n"), i, strtime);
- }
- for (size_t i = 512; i <= 2048; i <<= 1) {
- float* a = new float[i * i];
- float* b = new float[i * i];
- float* c = new float[i * i];
- float* d = new float[i * i];
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::mul(a, b, c, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n mul [%d]:\t%f"), i, sec);
- StartTimer();
- Matrix::strassen(a, b, d, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n str [%d]:\t%f\n"), i, sec);
- if (!Matrix::cmp(c, d, i, eps)) {
- _tprintf(_T("\n FAILED TO MUL MATRICES"));
- }
- }
- }
- static void TestSSEadd(LONGLONG fr) {
- for (size_t i = 32, k = 30; i <= 2048; i <<= 1) {
- double addtime = 0xFFFFFFFF;
- double ssetime = 0xFFFFFFFF;
- float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- for (size_t j = 0; j < k; j++) {
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::add(a, b, c, i);
- EndTimer(fr, &sec);
- addtime = sec < addtime ? sec : addtime;
- StartTimer();
- Matrix::SSEadd(a, b, d, i);
- EndTimer(fr, &sec);
- ssetime = sec < ssetime ? sec : ssetime;
- if (!Matrix::cmp(c, d, i, eps)) {
- _tprintf(_T("\n FAILED TO ADD MATRICES"));
- }
- }
- _tprintf(_T("\n add [%d]:\t%f"), i, addtime);
- _tprintf(_T("\n sse [%d]:\t%f\n"), i, ssetime);
- }
- }
- static void TestSSEsub(LONGLONG fr) {
- for (size_t i = 32, k = 30; i <= 2048; i <<= 1) {
- double subtime = 0xFFFFFFFF;
- double ssetime = 0xFFFFFFFF;
- float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- for (size_t j = 0; j < k; j++) {
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::sub(a, b, c, i);
- EndTimer(fr, &sec);
- subtime = sec < subtime ? sec : subtime;
- StartTimer();
- Matrix::SSEsub(a, b, d, i);
- EndTimer(fr, &sec);
- ssetime = sec < ssetime ? sec : ssetime;
- if (!Matrix::cmp(c, d, i, eps)) {
- _tprintf(_T("\n FAILED TO SUB MATRICES"));
- }
- }
- _tprintf(_T("\n sub [%d]:\t%f"), i, subtime);
- _tprintf(_T("\n sse [%d]:\t%f\n"), i, ssetime);
- }
- }
- static void TestSSEmul(LONGLONG fr) {
- for (size_t i = 32, k = 30; i <= 256; i <<= 1) {
- double multime = 0xFFFFFFFF;
- double ssetime = 0xFFFFFFFF;
- float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- for (size_t j = 0; j < k; j++) {
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::mul(a, b, c, i);
- EndTimer(fr, &sec);
- multime = sec < multime ? sec : multime;
- StartTimer();
- Matrix::SSEmul(a, b, d, i);
- EndTimer(fr, &sec);
- ssetime = sec < ssetime ? sec : ssetime;
- if (!Matrix::cmp(c, d, i, eps)) {
- _tprintf(_T("\n FAILED TO MUL MATRICES"));
- }
- }
- _tprintf(_T("\n mul [%d]:\t%f"), i, multime);
- _tprintf(_T("\n sse [%d]:\t%f\n"), i, ssetime);
- }
- for (size_t i = 512; i <= 2048; i <<= 1) {
- float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::mul(a, b, c, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n mul [%d]:\t%f"), i, sec);
- StartTimer();
- Matrix::SSEmul(a, b, d, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n sse [%d]:\t%f\n"), i, sec);
- if (!Matrix::cmp(c, d, i, eps)) {
- _tprintf(_T("\n FAILED TO MUL MATRICES"));
- }
- }
- }
- static void TestSSEstrassen(LONGLONG fr) {
- for (size_t i = 32, k = 30; i <= 256; i <<= 1) {
- double multime = 0xFFFFFFFF;
- double ssetime = 0xFFFFFFFF;
- float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- for (size_t j = 0; j < k; j++) {
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::strassen(a, b, c, i);
- EndTimer(fr, &sec);
- multime = sec < multime ? sec : multime;
- StartTimer();
- Matrix::SSEstrassen(a, b, d, i);
- EndTimer(fr, &sec);
- ssetime = sec < ssetime ? sec : ssetime;
- if (!Matrix::cmp(c, d, i, eps)) {
- _tprintf(_T("\n FAILED TO MUL MATRICES"));
- }
- }
- _tprintf(_T("\n str [%d]:\t%f"), i, multime);
- _tprintf(_T("\n sse [%d]:\t%f\n"), i, ssetime);
- }
- for (size_t i = 512; i <= 2048; i <<= 1) {
- float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::strassen(a, b, c, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n str [%d]:\t%f"), i, sec);
- StartTimer();
- Matrix::SSEstrassen(a, b, d, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n sse [%d]:\t%f\n"), i, sec);
- if (!Matrix::cmp(c, d, i, eps)) {
- _tprintf(_T("\n FAILED TO MUL MATRICES"));
- }
- }
- }
- static void TestSSEdiv(LONGLONG fr) {
- for (size_t i = 32; i <= 2048; i <<= 1) {
- float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::divStrassen(a, b, c, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n div [%d]:\t%f"), i, sec);
- StartTimer();
- Matrix::SSEdiv(a, b, d, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n sse [%d]:\t%f\n"), i, sec);
- if (!Matrix::cmp(c, d, i, eps)) {
- _tprintf(_T("\n FAILED TO DIV MATRICES"));
- }
- }
- }
- static void TestDiv(LONGLONG fr) {
- for (size_t i = 32; i <= 2048; i <<= 1) {
- uint64 multime = 0xFFFFFFFF;
- float* a = new float[i * i];
- float* b = new float[i * i];
- float* c = new float[i * i];
- Matrix::randomize(a, i);
- Matrix::randomize(b, i);
- StartTimer();
- Matrix::div(a, b, c, i);
- EndTimer(fr, &sec);
- _tprintf(_T("\n div [%d]:\t%f"), i, sec);
- }
- }
- static void Test(LONGLONG fr) {
- /*_tprintf(_T("TestAddSub(fr)\n"));
- TestAddSub(fr);
- _tprintf(_T("\n\nTestMulStr(fr)\n"));
- TestMulStr(fr);
- _tprintf(_T("\n\nTestSSEadd(fr)\n"));
- TestSSEadd(fr);
- _tprintf(_T("\n\nTestSSEsub(fr)\n"));
- TestSSEsub(fr);
- _tprintf(_T("\n\nTestSSEmul(fr)\n"));
- TestSSEmul(fr);
- _tprintf(_T("\n\nTestSSEstrassen(fr)\n"));
- TestSSEstrassen(fr);*/
- _tprintf(_T("\n\nTestSSEdiv(fr)\n"));
- TestSSEdiv(fr);
- /*_tprintf(_T("\n\nTestDiv(fr)\n"));
- TestDiv(fr);*/
- }
- int main() {
- srand((unsigned int)time(0));
- QPF(&frequency);
- LONGLONG fr = frequency.QuadPart;
- Test(fr);
- _getch();
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement