Advertisement
Guest User

Untitled

a guest
Oct 31st, 2014
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 22.78 KB | None | 0 0
  1. // matrix.cpp : main project file.
  2.  
  3. #include "stdafx.h"
  4. #include <conio.h>
  5. #include <Windows.h>
  6. #include <tchar.h>
  7. #include <time.h>
  8. #include <cmath>
  9. #include <intrin.h>
  10. #include <immintrin.h>
  11.  
  12. #define QPF(f) QueryPerformanceFrequency(f)
  13. #define QPC(f) QueryPerformanceCounter(f)
  14. #define eps 0.5f
  15.  
  16. typedef unsigned __int64 uint64;
  17.  
  18. using namespace System;
  19. using namespace std;
  20.  
  21. LARGE_INTEGER frequency, st, f;
  22. uint64 start, finish = 0xFFFFFFFF;
  23. double sec;
  24.  
  25. static void StartTimer() {
  26. QPC(&st);
  27. start = st.QuadPart;
  28. }
  29.  
  30. static void EndTimer(LONGLONG fr, double *sec) {
  31. QPC(&f);
  32. finish = f.QuadPart;
  33. *sec = (finish - start) / (double)fr;
  34. }
  35.  
  36. class Matrix {
  37.  
  38. public:
  39.  
  40. static void randomize(float* a, size_t n) {
  41. for (size_t i = 0; i < n; i++) {
  42. for (size_t j = 0; j < n; j++) {
  43. a[i * n + j] = (float)(rand() % 10) + 1.0f;
  44. }
  45. }
  46. }
  47.  
  48. static void print(float* a, size_t n) {
  49. for (size_t i = 0; i < n; i++) {
  50. for (size_t j = 0; j < n; j++) {
  51. _tprintf(_T("%f "), a[i * n + j]);
  52. }
  53. _tprintf(_T("\n"));
  54. }
  55. _tprintf(_T("-----------\n\n"));
  56. }
  57.  
  58. static void swap(float* a, size_t n, size_t r1, size_t r2) {
  59. float tmp;
  60. for (size_t j = 0; j < n; j++) {
  61. tmp = a[n * r1 + j];
  62. a[n * r1 + j] = a[n * r2 + j];
  63. a[n * r2 + j] = tmp;
  64. }
  65. }
  66.  
  67. static void add(float* a, float* b, float* c, size_t n) {
  68. for (size_t i = 0; i < n; i++) {
  69. for (size_t j = 0; j < n; j++) {
  70. c[i * n + j] = a[i * n + j] + b[i * n + j];
  71. }
  72. }
  73. }
  74.  
  75. static void sub(float* a, float* b, float* c, size_t n) {
  76. for (size_t i = 0; i < n; i++) {
  77. for (size_t j = 0; j < n; j++) {
  78. c[i * n + j] = a[i * n + j] - b[i * n + j];
  79. }
  80. }
  81. }
  82.  
  83. static void SSEadd(float* a, float* b, float* c, size_t n) {
  84. __m128* pa = (__m128*) a;
  85. __m128* pb = (__m128*) b;
  86. __m128* pc = (__m128*) c;
  87. for (size_t i = 0; i < n * n / 4; i += 4) {
  88. pc[i] = _mm_add_ps(pa[i], pb[i]);
  89. pc[i + 1] = _mm_add_ps(pa[i + 1], pb[i + 1]);
  90. pc[i + 2] = _mm_add_ps(pa[i + 2], pb[i + 2]);
  91. pc[i + 3] = _mm_add_ps(pa[i + 3], pb[i + 3]);
  92. }
  93. }
  94.  
  95. /*static void AVXadd(float* a, float* b, float* c, size_t n) {
  96. __m256* pa = (__m256*) a;
  97. __m256* pb = (__m256*) b;
  98. __m256* pc = (__m256*) c;
  99. for (size_t i = 0; i < n * n / 8; i ++) {
  100. pc[i] = _mm256_add_ps(pa[i], pb[i]);
  101. }
  102. }*/
  103.  
  104. static void SSEsub(float* a, float* b, float* c, size_t n) {
  105. __m128* pa = (__m128*) a;
  106. __m128* pb = (__m128*) b;
  107. __m128* pc = (__m128*) c;
  108. for (size_t i = 0; i < n * n / 4; i += 4) {
  109. pc[i] = _mm_sub_ps(pa[i], pb[i]);
  110. pc[i + 1] = _mm_sub_ps(pa[i + 1], pb[i + 1]);
  111. pc[i + 2] = _mm_sub_ps(pa[i + 2], pb[i + 2]);
  112. pc[i + 3] = _mm_sub_ps(pa[i + 3], pb[i + 3]);
  113. }
  114. }
  115.  
  116. static void mul(float* a, float* b, float* c, size_t n) {
  117. for (size_t i = 0; i < n; i++) {
  118. for (size_t j = 0; j < n; j++) {
  119. for (size_t k = 0; k < n; k++) {
  120. c[i * n + k] += a[i * n + j] * b[j * n + k];
  121. }
  122. }
  123. }
  124. }
  125.  
  126. static void SSEmul(float* a, float* b, float* c, size_t n) {
  127. __m128 *pb = (__m128*)b;
  128. __m128 *pc = (__m128*)c;
  129. size_t n4 = n / 4;
  130.  
  131. for (size_t i = 0; i < n; i++) {
  132. for (size_t j = 0; j < n; j++) {
  133. __m128 temp = _mm_set1_ps(a[i * n + j]);
  134. for (size_t k = 0; k < n4; k++) {
  135. pc[i * n4 + k] = _mm_add_ps(pc[i * n4 + k], _mm_mul_ps(temp, pb[j * n4 + k]));
  136. }
  137. }
  138. }
  139. }
  140.  
  141. static void strassen(float* a, float* b, float* c, size_t n) {
  142. size_t half = n / 2;
  143.  
  144. if (n <= 512) {
  145. mul(a, b, c, n);
  146. return;
  147. }
  148.  
  149. float* A11 = new float[half * half];
  150. float* A12 = new float[half * half];
  151. float* A21 = new float[half * half];
  152. float* A22 = new float[half * half];
  153.  
  154. float* B11 = new float[half * half];
  155. float* B12 = new float[half * half];
  156. float* B21 = new float[half * half];
  157. float* B22 = new float[half * half];
  158.  
  159. float* C11 = new float[half * half];
  160. float* C12 = new float[half * half];
  161. float* C21 = new float[half * half];
  162. float* C22 = new float[half * half];
  163.  
  164. float* P1 = new float[half * half];
  165. float* P2 = new float[half * half];
  166. float* P3 = new float[half * half];
  167. float* P4 = new float[half * half];
  168. float* P5 = new float[half * half];
  169. float* P6 = new float[half * half];
  170. float* P7 = new float[half * half];
  171.  
  172. float* ARes = new float[half * half];
  173. float* BRes = new float[half * half];
  174.  
  175. for (size_t i = 0; i < half; i++) {
  176. for (size_t j = 0; j < half; j++) {
  177. A11[i * half + j] = a[i * n + j];
  178. A12[i * half + j] = a[i * n + j + half];
  179. A21[i * half + j] = a[(i + half) * n + j];
  180. A22[i * half + j] = a[(i + half) * n + j + half];
  181.  
  182. B11[i * half + j] = b[i * n + j];
  183. B12[i * half + j] = b[i * n + j + half];
  184. B21[i * half + j] = b[(i + half) * n + j];
  185. B22[i * half + j] = b[(i + half) * n + j + half];
  186. }
  187. }
  188.  
  189. //P1
  190. add(A11, A22, ARes, half);
  191. add(B11, B22, BRes, half);
  192. strassen(ARes, BRes, P1, half);
  193.  
  194. //P2
  195. add(A21, A22, ARes, half);
  196. strassen(ARes, B11, P2, half);
  197.  
  198. //P3
  199. sub(B12, B22, BRes, half);
  200. strassen(A11, BRes, P3, half);
  201.  
  202. //P4
  203. sub(B21, B11, BRes, half);
  204. strassen(A22, BRes, P4, half);
  205.  
  206. //P5
  207. add(A11, A12, ARes, half);
  208. strassen(ARes, B22, P5, half);
  209.  
  210. //P6
  211. sub(A21, A11, ARes, half);
  212. add(B11, B12, BRes, half);
  213. strassen(ARes, BRes, P6, half);
  214. delete[] A21;
  215. delete[] A11;
  216. delete[] B11;
  217. delete[] B12;
  218.  
  219. //P7
  220. sub(A12, A22, ARes, half);
  221. add(B21, B22, BRes, half);
  222. strassen(ARes, BRes, P7, half);
  223. delete[] A12;
  224. delete[] A22;
  225. delete[] B21;
  226. delete[] B22;
  227.  
  228. //C11
  229. add(P1, P4, ARes, half);
  230. sub(P7, P5, BRes, half);
  231. add(ARes, BRes, C11, half);
  232. delete[] P7;
  233.  
  234. //C12
  235. add(P3, P5, C12, half);
  236. delete[] P5;
  237.  
  238. //C21
  239. add(P2, P4, C21, half);
  240. delete[] P4;
  241.  
  242. //C22
  243. add(P1, P3, ARes, half);
  244. sub(P6, P2, BRes, half);
  245. add(ARes, BRes, C22, half);
  246. delete[] P1;
  247. delete[] P3;
  248. delete[] P6;
  249. delete[] P2;
  250.  
  251. for (size_t i = 0; i < half; i++) {
  252. for (size_t j = 0; j < half; j++) {
  253. c[i * n + j] = C11[i * half + j];
  254. c[i * n + j + half] = C12[i * half + j];
  255. c[(i + half) * n + j] = C21[i * half + j];
  256. c[(i + half) * n + j + half] = C22[i * half + j];
  257. }
  258. }
  259. }
  260.  
  261. static void SSEstrassen(float* a, float* b, float* c, size_t n) {
  262. size_t half = n / 2;
  263.  
  264. if (n <= 512) {
  265. SSEmul(a, b, c, n);
  266. return;
  267. }
  268.  
  269. float* A11 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  270. float* A12 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  271. float* A21 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  272. float* A22 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  273.  
  274. float* B11 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  275. float* B12 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  276. float* B21 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  277. float* B22 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  278.  
  279. float* C11 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  280. float* C12 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  281. float* C21 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  282. float* C22 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  283.  
  284. float* P1 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  285. float* P2 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  286. float* P3 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  287. float* P4 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  288. float* P5 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  289. float* P6 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  290. float* P7 = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  291.  
  292. float* ARes = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  293. float* BRes = (float*)_aligned_malloc(sizeof(float) * half * half, 16);
  294.  
  295. for (size_t i = 0; i < half; i++) {
  296. for (size_t j = 0; j < half; j++) {
  297. A11[i * half + j] = a[i * n + j];
  298. A12[i * half + j] = a[i * n + j + half];
  299. A21[i * half + j] = a[(i + half) * n + j];
  300. A22[i * half + j] = a[(i + half) * n + j + half];
  301.  
  302. B11[i * half + j] = b[i * n + j];
  303. B12[i * half + j] = b[i * n + j + half];
  304. B21[i * half + j] = b[(i + half) * n + j];
  305. B22[i * half + j] = b[(i + half) * n + j + half];
  306. }
  307. }
  308.  
  309. //P1
  310. SSEadd(A11, A22, ARes, half);
  311. SSEadd(B11, B22, BRes, half);
  312. SSEstrassen(ARes, BRes, P1, half);
  313.  
  314. //P2
  315. SSEadd(A21, A22, ARes, half);
  316. SSEstrassen(ARes, B11, P2, half);
  317.  
  318. //P3
  319. SSEsub(B12, B22, BRes, half);
  320. SSEstrassen(A11, BRes, P3, half);
  321.  
  322. //P4
  323. SSEsub(B21, B11, BRes, half);
  324. SSEstrassen(A22, BRes, P4, half);
  325.  
  326. //P5
  327. SSEadd(A11, A12, ARes, half);
  328. SSEstrassen(ARes, B22, P5, half);
  329.  
  330. //P6
  331. SSEsub(A21, A11, ARes, half);
  332. SSEadd(B11, B12, BRes, half);
  333. SSEstrassen(ARes, BRes, P6, half);
  334.  
  335. //P7
  336. SSEsub(A12, A22, ARes, half);
  337. SSEadd(B21, B22, BRes, half);
  338. SSEstrassen(ARes, BRes, P7, half);
  339.  
  340. //C11
  341. SSEadd(P1, P4, ARes, half);
  342. SSEsub(P7, P5, BRes, half);
  343. SSEadd(ARes, BRes, C11, half);
  344.  
  345. //C12
  346. SSEadd(P3, P5, C12, half);
  347.  
  348. //C21
  349. SSEadd(P2, P4, C21, half);
  350.  
  351. //C22
  352. SSEadd(P1, P3, ARes, half);
  353. SSEsub(P6, P2, BRes, half);
  354. SSEadd(ARes, BRes, C22, half);
  355.  
  356. for (size_t i = 0; i < half; i++) {
  357. for (size_t j = 0; j < half; j++) {
  358. c[i * n + j] = C11[i * half + j];
  359. c[i * n + j + half] = C12[i * half + j];
  360. c[(i + half) * n + j] = C21[i * half + j];
  361. c[(i + half) * n + j + half] = C22[i * half + j];
  362. }
  363. }
  364. }
  365.  
  366. static void invert(float *A, float* E, int N)
  367. {
  368. float temp;
  369.  
  370. for (int i = 0; i < N; i++) {
  371. for (int j = 0; j < N; j++) {
  372. E[i * N + j] = 0.0f;
  373. if (i == j)
  374. E[i * N + j] = 1.0f;
  375. }
  376. }
  377. //added swap
  378. for (int k = 0; k < N; k++) {
  379. temp = A[k * N + k];
  380. if (abs(A[k * N + k] - eps) < 0) {
  381. int t = 0;
  382. for (int i = 0; i > N; i++) {
  383. if (abs(A[k * N + k] - eps) < 0) {
  384. swap(A, N, i, k);
  385. t = 1;
  386. }
  387. }
  388. if (t == 0) {
  389. _tprintf(_T("matrix cannot be inverted"));
  390. return;
  391. }
  392. }
  393. for (int j = 0; j < N; j++) {
  394. A[k * N + j] /= temp;
  395. E[k * N + j] /= temp;
  396. }
  397. for (int i = k + 1; i < N; i++) {
  398. temp = A[i * N + k];
  399. for (int j = 0; j < N; j++) {
  400. A[i * N + j] -= A[k * N + j] * temp;
  401. E[i * N + j] -= E[k * N + j] * temp;
  402. }
  403. }
  404. }
  405. for (int k = N - 1; k > 0; k--) {
  406. for (int i = k - 1; i >= 0; i--) {
  407. temp = A[i * N + k];
  408. for (int j = 0; j < N; j++) {
  409. A[i * N + j] -= A[k * N + j] * temp;
  410. E[i * N + j] -= E[k * N + j] * temp;
  411. }
  412. }
  413. }
  414. }
  415.  
  416. static void SSEinvert(float *A, float* E, int N)
  417. {
  418. float tmp;
  419.  
  420. memset(E, 0, N * N * sizeof(float));
  421. for (int i = 0; i < N; i++) {
  422. for (int j = 0; j < N; j++) {
  423. if (i == j)
  424. E[i * N + j] = 1.0f;
  425. }
  426. }
  427.  
  428. __m128* pA = (__m128*) A;
  429. __m128* pE = (__m128*) E;
  430. for (int k = 0; k < N; k++) {
  431. __m128 temp = _mm_set1_ps(A[k * N + k]);
  432.  
  433. for (int j = 0; j < N; j++) {
  434. pA[k * N + j] = _mm_div_ps(pA[k * N + j], temp);
  435. pE[k * N + j] = _mm_div_ps(pE[k * N + j], temp);
  436.  
  437. pA[k * N + j + 1] = _mm_div_ps(pA[k * N + j + 1], temp);
  438. pE[k * N + j + 1] = _mm_div_ps(pE[k * N + j + 1], temp);
  439.  
  440. pA[k * N + j + 2] = _mm_div_ps(pA[k * N + j + 2], temp);
  441. pE[k * N + j + 2] = _mm_div_ps(pE[k * N + j + 2], temp);
  442.  
  443. pA[k * N + j + 3] = _mm_div_ps(pA[k * N + j + 3], temp);
  444. pE[k * N + j + 3] = _mm_div_ps(pE[k * N + j + 3], temp);
  445. }
  446. for (int i = k + 1; i < N; i++) {
  447. tmp = A[i * N + k];
  448. for (int j = 0; j < N; j++) {
  449. A[i * N + j] -= A[k * N + j] * tmp;
  450. E[i * N + j] -= E[k * N + j] * tmp;
  451. }
  452. }
  453. }
  454. for (int k = N - 1; k > 0; k--) {
  455. for (int i = k - 1; i >= 0; i--) {
  456. tmp = A[i * N + k];
  457. for (int j = 0; j < N; j++) {
  458. A[i * N + j] -= A[k * N + j] * tmp;
  459. E[i * N + j] -= E[k * N + j] * tmp;
  460. }
  461. }
  462. }
  463. }
  464.  
  465. static void div(float *a, float *b, float *c, int n) {
  466. float* d = new float[n * n];
  467. float* e = new float[n * n];
  468. for (int i = 0; i < n; i++) {
  469. for (int j = 0; j < n; j++) {
  470. d[i * n + j] = b[i * n + j];
  471. }
  472. }
  473. invert(d, e, n);
  474. mul(a, d, c, n);
  475. }
  476.  
  477. static void divStrassen(float *a, float *b, float *c, int n) {
  478. float* d = new float[n * n];
  479. float* e = new float[n * n];
  480. for (int i = 0; i < n; i++) {
  481. for (int j = 0; j < n; j++) {
  482. d[i * n + j] = b[i * n + j];
  483. }
  484. }
  485. invert(d, e, n);
  486. strassen(a, d, c, n);
  487. }
  488.  
  489. static void SSEdiv(float *a, float *b, float *c, int n) {
  490. float* d = (float*)_aligned_malloc(sizeof(float) * n * n, 16);
  491. float* e = (float*)_aligned_malloc(sizeof(float) * n * n, 16);
  492. for (int i = 0; i < n; i++) {
  493. for (int j = 0; j < n; j++) {
  494. d[i * n + j] = b[i * n + j];
  495. }
  496. }
  497. SSEinvert(d, e, n);
  498. SSEstrassen(a, d, c, n);
  499. }
  500.  
  501. static bool cmp(float *a, float *b, size_t n, float epsilon) {
  502. for (size_t i = 0; i < n; i++) {
  503. for (size_t j = 0; j < n; j++) {
  504. if (abs(a[i * n + j] - b[i * n + j]) > epsilon) {
  505. _tprintf(_T(" [%f != %f] "), a[i * n + j], b[i * n + j]);
  506. return false;
  507. }
  508. }
  509. }
  510. return true;
  511. }
  512. };
  513.  
  514.  
  515. static void TestAddSub(LONGLONG fr) {
  516. for (size_t i = 32, k = 30; i <= 512; i <<= 1) {
  517. double addtime = 0xFFFFFFFF;
  518. double subtime = 0xFFFFFFFF;
  519.  
  520. float* a = new float[i * i];
  521. float* b = new float[i * i];
  522. float* c = new float[i * i];
  523. float* d = new float[i * i];
  524. for (size_t j = 0; j < k; j++) {
  525. Matrix::randomize(a, i);
  526. Matrix::randomize(b, i);
  527.  
  528. StartTimer();
  529. Matrix::add(a, b, c, i);
  530. EndTimer(fr, &sec);
  531. addtime = sec < addtime ? sec : addtime;
  532.  
  533.  
  534. StartTimer();
  535. Matrix::sub(c, b, d, i);
  536. EndTimer(fr, &sec);
  537. subtime = sec < subtime ? sec : subtime;
  538.  
  539. if (!Matrix::cmp(a, d, i, eps)) {
  540. _tprintf(_T("\n FAILED TO ADD/SUB MATRICES"));
  541. }
  542. }
  543. _tprintf(_T("\n add [%d]:\t%f"), i, addtime);
  544. _tprintf(_T("\n sub [%d]:\t%f\n"), i, subtime);
  545. }
  546. for (size_t i = 1024; i <= 2048; i <<= 1) {
  547. float* a = new float[i * i];
  548. float* b = new float[i * i];
  549. float* c = new float[i * i];
  550. float* d = new float[i * i];
  551.  
  552. Matrix::randomize(a, i);
  553. Matrix::randomize(b, i);
  554.  
  555. StartTimer();
  556. Matrix::add(a, b, c, i);
  557. EndTimer(fr, &sec);
  558. _tprintf(_T("\n add [%d]:\t%f"), i, sec);
  559.  
  560. StartTimer();
  561. Matrix::sub(c, b, d, i);
  562. EndTimer(fr, &sec);
  563. _tprintf(_T("\n sub [%d]:\t%f\n"), i, sec);
  564.  
  565. if (!Matrix::cmp(a, d, i, eps)) {
  566. _tprintf(_T("\n FAILED TO ADD/SUB MATRICES"));
  567. }
  568. }
  569. }
  570.  
  571. static void TestMulStr(LONGLONG fr) {
  572. for (size_t i = 32, k = 30; i <= 256; i <<= 1) {
  573. double multime = 0xFFFFFFFF;
  574. double strtime = 0xFFFFFFFF;
  575.  
  576. float* a = new float[i * i];
  577. float* b = new float[i * i];
  578. float* c = new float[i * i];
  579. float* d = new float[i * i];
  580. for (size_t j = 0; j < k; j++) {
  581. Matrix::randomize(a, i);
  582. Matrix::randomize(b, i);
  583.  
  584. StartTimer();
  585. Matrix::mul(a, b, c, i);
  586. EndTimer(fr, &sec);
  587. multime = sec < multime ? sec : multime;
  588.  
  589.  
  590. StartTimer();
  591. Matrix::strassen(a, b, d, i);
  592. EndTimer(fr, &sec);
  593. strtime = sec < strtime ? sec : strtime;
  594.  
  595. if (!Matrix::cmp(c, d, i, eps)) {
  596. _tprintf(_T("\n FAILED TO MUL MATRICES"));
  597. }
  598. }
  599. _tprintf(_T("\n mul [%d]:\t%f"), i, multime);
  600. _tprintf(_T("\n str [%d]:\t%f\n"), i, strtime);
  601. }
  602. for (size_t i = 512; i <= 2048; i <<= 1) {
  603. float* a = new float[i * i];
  604. float* b = new float[i * i];
  605. float* c = new float[i * i];
  606. float* d = new float[i * i];
  607.  
  608. Matrix::randomize(a, i);
  609. Matrix::randomize(b, i);
  610.  
  611. StartTimer();
  612. Matrix::mul(a, b, c, i);
  613. EndTimer(fr, &sec);
  614. _tprintf(_T("\n mul [%d]:\t%f"), i, sec);
  615.  
  616. StartTimer();
  617. Matrix::strassen(a, b, d, i);
  618. EndTimer(fr, &sec);
  619. _tprintf(_T("\n str [%d]:\t%f\n"), i, sec);
  620.  
  621. if (!Matrix::cmp(c, d, i, eps)) {
  622. _tprintf(_T("\n FAILED TO MUL MATRICES"));
  623. }
  624. }
  625. }
  626.  
  627. static void TestSSEadd(LONGLONG fr) {
  628. for (size_t i = 32, k = 30; i <= 2048; i <<= 1) {
  629. double addtime = 0xFFFFFFFF;
  630. double ssetime = 0xFFFFFFFF;
  631.  
  632. float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  633. float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  634. float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  635. float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  636. for (size_t j = 0; j < k; j++) {
  637. Matrix::randomize(a, i);
  638. Matrix::randomize(b, i);
  639.  
  640. StartTimer();
  641. Matrix::add(a, b, c, i);
  642. EndTimer(fr, &sec);
  643. addtime = sec < addtime ? sec : addtime;
  644.  
  645.  
  646. StartTimer();
  647. Matrix::SSEadd(a, b, d, i);
  648. EndTimer(fr, &sec);
  649. ssetime = sec < ssetime ? sec : ssetime;
  650.  
  651. if (!Matrix::cmp(c, d, i, eps)) {
  652. _tprintf(_T("\n FAILED TO ADD MATRICES"));
  653. }
  654. }
  655. _tprintf(_T("\n add [%d]:\t%f"), i, addtime);
  656. _tprintf(_T("\n sse [%d]:\t%f\n"), i, ssetime);
  657. }
  658. }
  659.  
  660. static void TestSSEsub(LONGLONG fr) {
  661. for (size_t i = 32, k = 30; i <= 2048; i <<= 1) {
  662. double subtime = 0xFFFFFFFF;
  663. double ssetime = 0xFFFFFFFF;
  664.  
  665. float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  666. float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  667. float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  668. float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  669. for (size_t j = 0; j < k; j++) {
  670. Matrix::randomize(a, i);
  671. Matrix::randomize(b, i);
  672.  
  673. StartTimer();
  674. Matrix::sub(a, b, c, i);
  675. EndTimer(fr, &sec);
  676. subtime = sec < subtime ? sec : subtime;
  677.  
  678.  
  679. StartTimer();
  680. Matrix::SSEsub(a, b, d, i);
  681. EndTimer(fr, &sec);
  682. ssetime = sec < ssetime ? sec : ssetime;
  683.  
  684. if (!Matrix::cmp(c, d, i, eps)) {
  685. _tprintf(_T("\n FAILED TO SUB MATRICES"));
  686. }
  687. }
  688. _tprintf(_T("\n sub [%d]:\t%f"), i, subtime);
  689. _tprintf(_T("\n sse [%d]:\t%f\n"), i, ssetime);
  690. }
  691. }
  692.  
  693. static void TestSSEmul(LONGLONG fr) {
  694. for (size_t i = 32, k = 30; i <= 256; i <<= 1) {
  695. double multime = 0xFFFFFFFF;
  696. double ssetime = 0xFFFFFFFF;
  697.  
  698. float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  699. float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  700. float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  701. float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  702. for (size_t j = 0; j < k; j++) {
  703. Matrix::randomize(a, i);
  704. Matrix::randomize(b, i);
  705.  
  706. StartTimer();
  707. Matrix::mul(a, b, c, i);
  708. EndTimer(fr, &sec);
  709. multime = sec < multime ? sec : multime;
  710.  
  711.  
  712. StartTimer();
  713. Matrix::SSEmul(a, b, d, i);
  714. EndTimer(fr, &sec);
  715. ssetime = sec < ssetime ? sec : ssetime;
  716.  
  717. if (!Matrix::cmp(c, d, i, eps)) {
  718. _tprintf(_T("\n FAILED TO MUL MATRICES"));
  719. }
  720. }
  721. _tprintf(_T("\n mul [%d]:\t%f"), i, multime);
  722. _tprintf(_T("\n sse [%d]:\t%f\n"), i, ssetime);
  723. }
  724. for (size_t i = 512; i <= 2048; i <<= 1) {
  725.  
  726. float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  727. float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  728. float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  729. float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  730. Matrix::randomize(a, i);
  731. Matrix::randomize(b, i);
  732.  
  733. StartTimer();
  734. Matrix::mul(a, b, c, i);
  735. EndTimer(fr, &sec);
  736. _tprintf(_T("\n mul [%d]:\t%f"), i, sec);
  737.  
  738.  
  739. StartTimer();
  740. Matrix::SSEmul(a, b, d, i);
  741. EndTimer(fr, &sec);
  742. _tprintf(_T("\n sse [%d]:\t%f\n"), i, sec);
  743.  
  744. if (!Matrix::cmp(c, d, i, eps)) {
  745. _tprintf(_T("\n FAILED TO MUL MATRICES"));
  746. }
  747. }
  748. }
  749.  
  750. static void TestSSEstrassen(LONGLONG fr) {
  751. for (size_t i = 32, k = 30; i <= 256; i <<= 1) {
  752. double multime = 0xFFFFFFFF;
  753. double ssetime = 0xFFFFFFFF;
  754.  
  755. float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  756. float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  757. float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  758. float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  759. for (size_t j = 0; j < k; j++) {
  760. Matrix::randomize(a, i);
  761. Matrix::randomize(b, i);
  762.  
  763. StartTimer();
  764. Matrix::strassen(a, b, c, i);
  765. EndTimer(fr, &sec);
  766. multime = sec < multime ? sec : multime;
  767.  
  768.  
  769. StartTimer();
  770. Matrix::SSEstrassen(a, b, d, i);
  771. EndTimer(fr, &sec);
  772. ssetime = sec < ssetime ? sec : ssetime;
  773.  
  774. if (!Matrix::cmp(c, d, i, eps)) {
  775. _tprintf(_T("\n FAILED TO MUL MATRICES"));
  776. }
  777. }
  778. _tprintf(_T("\n str [%d]:\t%f"), i, multime);
  779. _tprintf(_T("\n sse [%d]:\t%f\n"), i, ssetime);
  780. }
  781. for (size_t i = 512; i <= 2048; i <<= 1) {
  782.  
  783. float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  784. float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  785. float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  786. float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  787. Matrix::randomize(a, i);
  788. Matrix::randomize(b, i);
  789.  
  790. StartTimer();
  791. Matrix::strassen(a, b, c, i);
  792. EndTimer(fr, &sec);
  793. _tprintf(_T("\n str [%d]:\t%f"), i, sec);
  794.  
  795.  
  796. StartTimer();
  797. Matrix::SSEstrassen(a, b, d, i);
  798. EndTimer(fr, &sec);
  799. _tprintf(_T("\n sse [%d]:\t%f\n"), i, sec);
  800.  
  801. if (!Matrix::cmp(c, d, i, eps)) {
  802. _tprintf(_T("\n FAILED TO MUL MATRICES"));
  803. }
  804. }
  805. }
  806.  
  807. static void TestSSEdiv(LONGLONG fr) {
  808. for (size_t i = 32; i <= 2048; i <<= 1) {
  809.  
  810. float* a = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  811. float* b = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  812. float* c = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  813. float* d = (float*)_aligned_malloc(sizeof(float) * i * i, 16);
  814. Matrix::randomize(a, i);
  815. Matrix::randomize(b, i);
  816.  
  817. StartTimer();
  818. Matrix::divStrassen(a, b, c, i);
  819. EndTimer(fr, &sec);
  820. _tprintf(_T("\n div [%d]:\t%f"), i, sec);
  821.  
  822.  
  823. StartTimer();
  824. Matrix::SSEdiv(a, b, d, i);
  825. EndTimer(fr, &sec);
  826. _tprintf(_T("\n sse [%d]:\t%f\n"), i, sec);
  827.  
  828. if (!Matrix::cmp(c, d, i, eps)) {
  829. _tprintf(_T("\n FAILED TO DIV MATRICES"));
  830. }
  831. }
  832. }
  833.  
  834. static void TestDiv(LONGLONG fr) {
  835. for (size_t i = 32; i <= 2048; i <<= 1) {
  836. uint64 multime = 0xFFFFFFFF;
  837.  
  838. float* a = new float[i * i];
  839. float* b = new float[i * i];
  840. float* c = new float[i * i];
  841.  
  842. Matrix::randomize(a, i);
  843. Matrix::randomize(b, i);
  844.  
  845. StartTimer();
  846. Matrix::div(a, b, c, i);
  847. EndTimer(fr, &sec);
  848. _tprintf(_T("\n div [%d]:\t%f"), i, sec);
  849. }
  850. }
  851.  
  852. static void Test(LONGLONG fr) {
  853. /*_tprintf(_T("TestAddSub(fr)\n"));
  854. TestAddSub(fr);
  855. _tprintf(_T("\n\nTestMulStr(fr)\n"));
  856. TestMulStr(fr);
  857. _tprintf(_T("\n\nTestSSEadd(fr)\n"));
  858. TestSSEadd(fr);
  859. _tprintf(_T("\n\nTestSSEsub(fr)\n"));
  860. TestSSEsub(fr);
  861. _tprintf(_T("\n\nTestSSEmul(fr)\n"));
  862. TestSSEmul(fr);
  863. _tprintf(_T("\n\nTestSSEstrassen(fr)\n"));
  864. TestSSEstrassen(fr);*/
  865. _tprintf(_T("\n\nTestSSEdiv(fr)\n"));
  866. TestSSEdiv(fr);
  867. /*_tprintf(_T("\n\nTestDiv(fr)\n"));
  868. TestDiv(fr);*/
  869. }
  870.  
  871. int main() {
  872. srand((unsigned int)time(0));
  873.  
  874. QPF(&frequency);
  875. LONGLONG fr = frequency.QuadPart;
  876.  
  877. Test(fr);
  878.  
  879. _getch();
  880. return 0;
  881. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement