Advertisement
Guest User

Untitled

a guest
Nov 22nd, 2019
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 23.81 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <sys/time.h>
  4. #include <time.h>
  5. #include <papi.h>
  6. #include <time.h>
  7.  
  8. #include <stdio.h>
  9. #include <stdlib.h>
  10.  
  11. #include <mmintrin.h>
  12. #include <xmmintrin.h> // SSE
  13. #include <pmmintrin.h> // SSE2
  14. #include <emmintrin.h> // SSE3
  15.  
  16. #define T double
  17.  
  18.  
  19. T* make_matrix(int n)
  20. {
  21. T* mat = (T*)malloc(sizeof(T)*n*n);
  22. for (int i = 0; i < n*n; ++i)
  23. {
  24. mat[i] = rand() / RAND_MAX;
  25. }
  26. return mat;
  27. }
  28.  
  29. void destroy_matrix(T* m)
  30. {
  31. free(m);
  32. }
  33.  
  34. void mm0(T* lhs, T* rhs, T* result, int n)
  35. {
  36. for (int i = 0; i < n; ++i)
  37. {
  38. for (int j = 0; j < n; ++j)
  39. {
  40. T sum = 0;
  41. for (int k = 0; k < n; ++k)
  42. {
  43. sum += lhs[i*n+k] * rhs[k*n+j];
  44. }
  45. result[i*n+j] = sum;
  46. }
  47. }
  48. }
  49.  
  50. void mm1(T* lhs, T* rhs, T* result, int n)
  51. {
  52. for (int i = n; --i;)
  53. {
  54. for (int j = n; --j;)
  55. {
  56. T sum = 0;
  57. for (int k = n; --k;)
  58. {
  59. sum += lhs[i*n+k] * rhs[k*n+j];
  60. }
  61. result[i*n+j] = sum;
  62. }
  63. }
  64. }
  65.  
  66. void mm2(T* lhs, T* rhs, T* result, int n)
  67. {
  68. for (int i = n; --i;)
  69. {
  70. for (int j = n; --j;)
  71. {
  72. T sum = 0;
  73. int k = 0;
  74. for (; k < n - 7; k += 8)
  75. {
  76. sum += lhs[i*n+k] * rhs[k*n+j];
  77. sum += lhs[i*n+k+1] * rhs[(k+1)*n+j];
  78. sum += lhs[i*n+k+2] * rhs[(k+2)*n+j];
  79. sum += lhs[i*n+k+3] * rhs[(k+3)*n+j];
  80. sum += lhs[i*n+k+4] * rhs[(k+4)*n+j];
  81. sum += lhs[i*n+k+5] * rhs[(k+5)*n+j];
  82. sum += lhs[i*n+k+6] * rhs[(k+6)*n+j];
  83. sum += lhs[i*n+k+7] * rhs[(k+7)*n+j];
  84. }
  85. for (; k < n; ++k)
  86. {
  87. sum += lhs[i*n+k] * rhs[k*n+j];
  88. }
  89. result[i*n+j] = sum;
  90. }
  91. }
  92. }
  93.  
  94. void mm3(T* lhs, T* rhs, T* result, int n)
  95. {
  96. for (int i = n; --i;)
  97. {
  98. for (int j = n; --j;)
  99. {
  100. T sum = 0;
  101. int k = 0;
  102. for (; k < n - 15; k += 16)
  103. {
  104. sum += lhs[i*n+k] * rhs[k*n+j];
  105. sum += lhs[i*n+k+1] * rhs[(k+1)*n+j];
  106. sum += lhs[i*n+k+2] * rhs[(k+2)*n+j];
  107. sum += lhs[i*n+k+3] * rhs[(k+3)*n+j];
  108. sum += lhs[i*n+k+4] * rhs[(k+4)*n+j];
  109. sum += lhs[i*n+k+5] * rhs[(k+5)*n+j];
  110. sum += lhs[i*n+k+6] * rhs[(k+6)*n+j];
  111. sum += lhs[i*n+k+7] * rhs[(k+7)*n+j];
  112. sum += lhs[i*n+k+8] * rhs[(k+8)*n+j];
  113. sum += lhs[i*n+k+9] * rhs[(k+9)*n+j];
  114. sum += lhs[i*n+k+10] * rhs[(k+10)*n+j];
  115. sum += lhs[i*n+k+11] * rhs[(k+11)*n+j];
  116. sum += lhs[i*n+k+12] * rhs[(k+12)*n+j];
  117. sum += lhs[i*n+k+13] * rhs[(k+13)*n+j];
  118. sum += lhs[i*n+k+14] * rhs[(k+14)*n+j];
  119. sum += lhs[i*n+k+15] * rhs[(k+15)*n+j];
  120. }
  121. for (; k < n; ++k)
  122. {
  123. sum += lhs[i*n+k] * rhs[k*n+j];
  124. }
  125. result[i*n+j] = sum;
  126. }
  127. }
  128. }
  129.  
  130. void mm4(T* lhs, T* rhs, T* result, int n)
  131. {
  132. for (int i = n; --i;)
  133. {
  134. for (int j = n; --j;)
  135. {
  136. T sum = 0;
  137. int k = 0;
  138. for (; k < n - 31; k += 32)
  139. {
  140. sum += lhs[i*n+k] * rhs[k*n+j];
  141. sum += lhs[i*n+k+1] * rhs[(k+1)*n+j];
  142. sum += lhs[i*n+k+2] * rhs[(k+2)*n+j];
  143. sum += lhs[i*n+k+3] * rhs[(k+3)*n+j];
  144. sum += lhs[i*n+k+4] * rhs[(k+4)*n+j];
  145. sum += lhs[i*n+k+5] * rhs[(k+5)*n+j];
  146. sum += lhs[i*n+k+6] * rhs[(k+6)*n+j];
  147. sum += lhs[i*n+k+7] * rhs[(k+7)*n+j];
  148. sum += lhs[i*n+k+8] * rhs[(k+8)*n+j];
  149. sum += lhs[i*n+k+9] * rhs[(k+9)*n+j];
  150. sum += lhs[i*n+k+10] * rhs[(k+10)*n+j];
  151. sum += lhs[i*n+k+11] * rhs[(k+11)*n+j];
  152. sum += lhs[i*n+k+12] * rhs[(k+12)*n+j];
  153. sum += lhs[i*n+k+13] * rhs[(k+13)*n+j];
  154. sum += lhs[i*n+k+14] * rhs[(k+14)*n+j];
  155. sum += lhs[i*n+k+15] * rhs[(k+15)*n+j];
  156. sum += lhs[i*n+k+16] * rhs[(k+16)*n+j];
  157. sum += lhs[i*n+k+17] * rhs[(k+17)*n+j];
  158. sum += lhs[i*n+k+18] * rhs[(k+18)*n+j];
  159. sum += lhs[i*n+k+19] * rhs[(k+19)*n+j];
  160. sum += lhs[i*n+k+20] * rhs[(k+20)*n+j];
  161. sum += lhs[i*n+k+21] * rhs[(k+21)*n+j];
  162. sum += lhs[i*n+k+22] * rhs[(k+22)*n+j];
  163. sum += lhs[i*n+k+23] * rhs[(k+23)*n+j];
  164. sum += lhs[i*n+k+24] * rhs[(k+24)*n+j];
  165. sum += lhs[i*n+k+25] * rhs[(k+25)*n+j];
  166. sum += lhs[i*n+k+26] * rhs[(k+26)*n+j];
  167. sum += lhs[i*n+k+27] * rhs[(k+27)*n+j];
  168. sum += lhs[i*n+k+28] * rhs[(k+28)*n+j];
  169. sum += lhs[i*n+k+29] * rhs[(k+29)*n+j];
  170. sum += lhs[i*n+k+30] * rhs[(k+30)*n+j];
  171. sum += lhs[i*n+k+31] * rhs[(k+31)*n+j];
  172. }
  173. for (; k < n; ++k)
  174. {
  175. sum += lhs[i*n+k] * rhs[k*n+j];
  176. }
  177. result[i*n+j] = sum;
  178. }
  179. }
  180. }
  181.  
  182. void transpose(T* from, T* to, int n)
  183. {
  184. for (int i = 0; i < n; ++i)
  185. {
  186. for (int j = 0; j < n; ++j)
  187. {
  188. to[i*n+j] = from[i+j*n];
  189. }
  190. }
  191. }
  192.  
  193. void mm5(T* lhs, T* rhs, T* result, int n)
  194. {
  195. T* rhs_t = make_matrix(n);
  196. transpose(rhs, rhs_t, n);
  197.  
  198. for (int i = n; --i;)
  199. {
  200. for (int j = n; --j;)
  201. {
  202. T sum = 0;
  203. int k = 0;
  204. for (; k < n - 31; k += 32)
  205. {
  206. sum += lhs[i*n+k] * rhs_t[k+j*n];
  207. sum += lhs[i*n+k+1] * rhs_t[(k+1)+j*n];
  208. sum += lhs[i*n+k+2] * rhs_t[(k+2)+j*n];
  209. sum += lhs[i*n+k+3] * rhs_t[(k+3)+j*n];
  210. sum += lhs[i*n+k+4] * rhs_t[(k+4)+j*n];
  211. sum += lhs[i*n+k+5] * rhs_t[(k+5)+j*n];
  212. sum += lhs[i*n+k+6] * rhs_t[(k+6)+j*n];
  213. sum += lhs[i*n+k+7] * rhs_t[(k+7)+j*n];
  214. sum += lhs[i*n+k+8] * rhs_t[(k+8)+j*n];
  215. sum += lhs[i*n+k+9] * rhs_t[(k+9)+j*n];
  216. sum += lhs[i*n+k+10] * rhs_t[(k+10)+j*n];
  217. sum += lhs[i*n+k+11] * rhs_t[(k+11)+j*n];
  218. sum += lhs[i*n+k+12] * rhs_t[(k+12)+j*n];
  219. sum += lhs[i*n+k+13] * rhs_t[(k+13)+j*n];
  220. sum += lhs[i*n+k+14] * rhs_t[(k+14)+j*n];
  221. sum += lhs[i*n+k+15] * rhs_t[(k+15)+j*n];
  222. sum += lhs[i*n+k+16] * rhs_t[(k+16)+j*n];
  223. sum += lhs[i*n+k+17] * rhs_t[(k+17)+j*n];
  224. sum += lhs[i*n+k+18] * rhs_t[(k+18)+j*n];
  225. sum += lhs[i*n+k+19] * rhs_t[(k+19)+j*n];
  226. sum += lhs[i*n+k+20] * rhs_t[(k+20)+j*n];
  227. sum += lhs[i*n+k+21] * rhs_t[(k+21)+j*n];
  228. sum += lhs[i*n+k+22] * rhs_t[(k+22)+j*n];
  229. sum += lhs[i*n+k+23] * rhs_t[(k+23)+j*n];
  230. sum += lhs[i*n+k+24] * rhs_t[(k+24)+j*n];
  231. sum += lhs[i*n+k+25] * rhs_t[(k+25)+j*n];
  232. sum += lhs[i*n+k+26] * rhs_t[(k+26)+j*n];
  233. sum += lhs[i*n+k+27] * rhs_t[(k+27)+j*n];
  234. sum += lhs[i*n+k+28] * rhs_t[(k+28)+j*n];
  235. sum += lhs[i*n+k+29] * rhs_t[(k+29)+j*n];
  236. sum += lhs[i*n+k+30] * rhs_t[(k+30)+j*n];
  237. sum += lhs[i*n+k+31] * rhs_t[(k+31)+j*n];
  238. }
  239. for (; k < n; ++k)
  240. {
  241. sum += lhs[i*n+k] * rhs_t[k+j*n];
  242. }
  243. result[i*n+j] = sum;
  244. }
  245. }
  246.  
  247. destroy_matrix(rhs_t);
  248. }
  249.  
  250. void mm6(T* lhs, T* rhs, T* result, int n)
  251. {
  252. T* rhs_t = make_matrix(n);
  253. transpose(rhs, rhs_t, n);
  254.  
  255. for (int i = n; --i;)
  256. {
  257. for (int j = n; --j;)
  258. {
  259. T sum0 = 0;
  260. T sum1 = 0;
  261. T sum2 = 0;
  262. T sum3 = 0;
  263. int k = 0;
  264. for (; k < n - 7; k += 8)
  265. {
  266. sum0 += lhs[i*n+k] * rhs_t[k+j*n];
  267. sum1 += lhs[i*n+k+1] * rhs_t[(k+1)+j*n];
  268. sum2 += lhs[i*n+k+2] * rhs_t[(k+2)+j*n];
  269. sum3 += lhs[i*n+k+3] * rhs_t[(k+3)+j*n];
  270. sum0 += lhs[i*n+k+4] * rhs_t[(k+4)+j*n];
  271. sum1 += lhs[i*n+k+5] * rhs_t[(k+5)+j*n];
  272. sum2 += lhs[i*n+k+6] * rhs_t[(k+6)+j*n];
  273. sum3 += lhs[i*n+k+7] * rhs_t[(k+7)+j*n];
  274. }
  275. for (; k < n; ++k)
  276. {
  277. sum0 += lhs[i*n+k] * rhs_t[k+j*n];
  278. }
  279. result[i*n+j] = sum0 + sum1 + sum2 + sum3;
  280. }
  281. }
  282.  
  283. destroy_matrix(rhs_t);
  284. }
  285.  
  286. void mm7(T* lhs, T* rhs, T* result, int n)
  287. {
  288. T* rhs_t = make_matrix(n);
  289. transpose(rhs, rhs_t, n);
  290.  
  291. for (int i = 0; i < n; ++i)
  292. {
  293. int j = 0;
  294. for (; j < n - 7; j += 4)
  295. {
  296. T sum0 = 0;
  297. T sum1 = 0;
  298. T sum2 = 0;
  299. T sum3 = 0;
  300. int k = 0;
  301. for (; k < n - 7; k += 8)
  302. {
  303. const T a0 = lhs[i*n+k+0];
  304. const T a1 = lhs[i*n+k+1];
  305. const T a2 = lhs[i*n+k+2];
  306. const T a3 = lhs[i*n+k+3];
  307. const T a4 = lhs[i*n+k+4];
  308. const T a5 = lhs[i*n+k+5];
  309. const T a6 = lhs[i*n+k+6];
  310. const T a7 = lhs[i*n+k+7];
  311.  
  312. sum0 += a0 * rhs_t[(k+0)+j*n];
  313. sum1 += a0 * rhs_t[(k+0)+(j+1)*n];
  314. sum2 += a0 * rhs_t[(k+0)+(j+2)*n];
  315. sum3 += a0 * rhs_t[(k+0)+(j+3)*n];
  316.  
  317. sum0 += a1 * rhs_t[(k+1)+j*n];
  318. sum1 += a1 * rhs_t[(k+1)+(j+1)*n];
  319. sum2 += a1 * rhs_t[(k+1)+(j+2)*n];
  320. sum3 += a1 * rhs_t[(k+1)+(j+3)*n];
  321.  
  322. sum0 += a2 * rhs_t[(k+2)+j*n];
  323. sum1 += a2 * rhs_t[(k+2)+(j+1)*n];
  324. sum2 += a2 * rhs_t[(k+2)+(j+2)*n];
  325. sum3 += a2 * rhs_t[(k+2)+(j+3)*n];
  326.  
  327. sum0 += a3 * rhs_t[(k+3)+j*n];
  328. sum1 += a3 * rhs_t[(k+3)+(j+1)*n];
  329. sum2 += a3 * rhs_t[(k+3)+(j+2)*n];
  330. sum3 += a3 * rhs_t[(k+3)+(j+3)*n];
  331.  
  332. sum0 += a4 * rhs_t[(k+4)+j*n];
  333. sum1 += a4 * rhs_t[(k+4)+(j+1)*n];
  334. sum2 += a4 * rhs_t[(k+4)+(j+2)*n];
  335. sum3 += a4 * rhs_t[(k+4)+(j+3)*n];
  336.  
  337. sum0 += a5 * rhs_t[(k+5)+j*n];
  338. sum1 += a5 * rhs_t[(k+5)+(j+1)*n];
  339. sum2 += a5 * rhs_t[(k+5)+(j+2)*n];
  340. sum3 += a5 * rhs_t[(k+5)+(j+3)*n];
  341.  
  342. sum0 += a6 * rhs_t[(k+6)+j*n];
  343. sum1 += a6 * rhs_t[(k+6)+(j+1)*n];
  344. sum2 += a6 * rhs_t[(k+6)+(j+2)*n];
  345. sum3 += a6 * rhs_t[(k+6)+(j+3)*n];
  346.  
  347. sum0 += a7 * rhs_t[(k+7)+j*n];
  348. sum1 += a7 * rhs_t[(k+7)+(j+1)*n];
  349. sum2 += a7 * rhs_t[(k+7)+(j+2)*n];
  350. sum3 += a7 * rhs_t[(k+7)+(j+3)*n];
  351. }
  352. result[i*n+j] = sum0;
  353. result[i*n+j+1] = sum1;
  354. result[i*n+j+2] = sum2;
  355. result[i*n+j+3] = sum3;
  356. }
  357. }
  358.  
  359. destroy_matrix(rhs_t);
  360. }
  361.  
  362. void mm8_d(double* lhs, double* rhs, double* result, int n)
  363. {
  364. double* rhs_t = make_matrix(n);
  365. transpose(rhs, rhs_t, n);
  366.  
  367. for (int i = 0; i < n; ++i)
  368. {
  369. int j = 0;
  370. for (; j < n - 7; j += 8)
  371. {
  372. __m128d sum0 = _mm_setzero_pd();
  373. __m128d sum1 = _mm_setzero_pd();
  374. __m128d sum2 = _mm_setzero_pd();
  375. __m128d sum3 = _mm_setzero_pd();
  376. __m128d sum4 = _mm_setzero_pd();
  377. __m128d sum5 = _mm_setzero_pd();
  378. __m128d sum6 = _mm_setzero_pd();
  379. __m128d sum7 = _mm_setzero_pd();
  380. int k = 0;
  381. for (; k < n - 7; k += 8)
  382. {
  383. const __m128d a01 = _mm_load_pd(&lhs[i*n+k+0]);
  384. const __m128d a23 = _mm_load_pd(&lhs[i*n+k+2]);
  385. const __m128d a45 = _mm_load_pd(&lhs[i*n+k+4]);
  386. const __m128d a67 = _mm_load_pd(&lhs[i*n+k+6]);
  387.  
  388. sum0 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+j*n]));
  389. sum0 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+j*n]));
  390. sum0 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+j*n]));
  391. sum0 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+j*n]));
  392.  
  393. sum1 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+1)*n]));
  394. sum1 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+1)*n]));
  395. sum1 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+1)*n]));
  396. sum1 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+1)*n]));
  397.  
  398. sum2 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+2)*n]));
  399. sum2 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+2)*n]));
  400. sum2 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+2)*n]));
  401. sum2 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+2)*n]));
  402.  
  403. sum3 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+3)*n]));
  404. sum3 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+3)*n]));
  405. sum3 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+3)*n]));
  406. sum3 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+3)*n]));
  407.  
  408. sum4 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+4)*n]));
  409. sum4 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+4)*n]));
  410. sum4 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+4)*n]));
  411. sum4 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+4)*n]));
  412.  
  413. sum5 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+5)*n]));
  414. sum5 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+5)*n]));
  415. sum5 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+5)*n]));
  416. sum1 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+5)*n]));
  417.  
  418. sum6 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+6)*n]));
  419. sum6 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+6)*n]));
  420. sum6 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+6)*n]));
  421. sum6 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+6)*n]));
  422.  
  423. sum7 += _mm_mul_pd(a01, _mm_load_pd(&rhs_t[(k+0)+(j+7)*n]));
  424. sum7 += _mm_mul_pd(a23, _mm_load_pd(&rhs_t[(k+2)+(j+7)*n]));
  425. sum7 += _mm_mul_pd(a45, _mm_load_pd(&rhs_t[(k+4)+(j+7)*n]));
  426. sum7 += _mm_mul_pd(a67, _mm_load_pd(&rhs_t[(k+6)+(j+7)*n]));
  427. }
  428. _mm_store_pd(&result[i*n+j], _mm_hadd_pd(sum0, sum1));
  429. _mm_store_pd(&result[i*n+j+2], _mm_hadd_pd(sum2, sum3));
  430. _mm_store_pd(&result[i*n+j+4], _mm_hadd_pd(sum4, sum5));
  431. _mm_store_pd(&result[i*n+j+6], _mm_hadd_pd(sum6, sum7));
  432. }
  433. }
  434.  
  435. destroy_matrix(rhs_t);
  436. }
  437.  
  438.  
  439. void mm8_f(float* lhs, float* rhs, float* result, int n)
  440. {
  441. float* rhs_t = make_matrix(n);
  442. transpose(rhs, rhs_t, n);
  443.  
  444. for (int i = 0; i < n; ++i)
  445. {
  446. int j = 0;
  447. for (; j < n - 7; j += 8)
  448. {
  449. __m128 sum0 = _mm_setzero_ps();
  450. __m128 sum1 = _mm_setzero_ps();
  451. __m128 sum2 = _mm_setzero_ps();
  452. __m128 sum3 = _mm_setzero_ps();
  453. __m128 sum4 = _mm_setzero_ps();
  454. __m128 sum5 = _mm_setzero_ps();
  455. __m128 sum6 = _mm_setzero_ps();
  456. __m128 sum7 = _mm_setzero_ps();
  457. int k = 0;
  458. for (; k < n - 15; k += 16)
  459. {
  460. const __m128 a0123 = _mm_load_ps(&lhs[i*n+k+0]);
  461. const __m128 a4567 = _mm_load_ps(&lhs[i*n+k+4]);
  462. const __m128 b0123 = _mm_load_ps(&lhs[i*n+k+8]);
  463. const __m128 b4567 = _mm_load_ps(&lhs[i*n+k+12]);
  464.  
  465. sum0 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+j*n]));
  466. sum0 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+j*n]));
  467. sum0 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+j*n]));
  468. sum0 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+j*n]));
  469.  
  470. sum1 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+1)*n]));
  471. sum1 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+1)*n]));
  472. sum1 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+1)*n]));
  473. sum1 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+1)*n]));
  474.  
  475. sum2 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+2)*n]));
  476. sum2 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+2)*n]));
  477. sum3 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+2)*n]));
  478. sum3 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+2)*n]));
  479.  
  480. sum3 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+3)*n]));
  481. sum3 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+3)*n]));
  482. sum4 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+3)*n]));
  483. sum4 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+3)*n]));
  484.  
  485. sum4 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+4)*n]));
  486. sum4 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+4)*n]));
  487. sum4 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+4)*n]));
  488. sum4 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+4)*n]));
  489.  
  490. sum5 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+5)*n]));
  491. sum5 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+5)*n]));
  492. sum5 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+5)*n]));
  493. sum5 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+5)*n]));
  494.  
  495. sum6 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+6)*n]));
  496. sum6 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+6)*n]));
  497. sum6 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+6)*n]));
  498. sum6 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+6)*n]));
  499.  
  500. sum7 += _mm_mul_ps(a0123, _mm_load_ps(&rhs_t[(k+0)+(j+7)*n]));
  501. sum7 += _mm_mul_ps(a4567, _mm_load_ps(&rhs_t[(k+4)+(j+7)*n]));
  502. sum7 += _mm_mul_ps(b0123, _mm_load_ps(&rhs_t[(k+8)+(j+7)*n]));
  503. sum7 += _mm_mul_ps(b4567, _mm_load_ps(&rhs_t[(k+12)+(j+7)*n]));
  504. }
  505. __m128 t0 = _mm_hadd_ps(sum0, sum1);
  506. __m128 t1 = _mm_hadd_ps(sum2, sum3);
  507. __m128 t2 = _mm_hadd_ps(sum4, sum5);
  508. __m128 t3 = _mm_hadd_ps(sum6, sum7);
  509. _mm_store_ps(&result[i*n+j], _mm_hadd_ps(t0, t1));
  510. _mm_store_ps(&result[i*n+j+4], _mm_hadd_ps(t2, t3));
  511. }
  512. }
  513.  
  514. destroy_matrix(rhs_t);
  515. }
  516.  
  517. #define SIZE 512
  518.  
  519. static double gtod_ref_time_sec = 0.0;
  520.  
  521. /* Adapted from the bl2_clock() routine in the BLIS library */
  522. double dclock() {
  523. double the_time, norm_sec;
  524. struct timeval tv;
  525. gettimeofday(&tv, NULL);
  526. if (gtod_ref_time_sec == 0.0)
  527. gtod_ref_time_sec = (double) tv.tv_sec;
  528. norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
  529. the_time = norm_sec + tv.tv_usec * 1.0e-6;
  530. return the_time;
  531. }
  532.  
  533. /* Mulitplication of two matrices */
  534. int mm(double first[][SIZE], double second[][SIZE], double multiply[][SIZE]) {
  535. int i, j, k;
  536. double sum = 0;
  537. for (i = 0; i < SIZE; i++) { //rows in multiply
  538. for (j = 0; j < SIZE; j++) { //columns in multiply
  539. for (k = 0; k < SIZE; k++) { //columns in first and rows in second
  540. sum = sum + first[i][k] * second[k][j];
  541. }
  542. multiply[i][j] = sum;
  543. sum = 0;
  544. }
  545. }
  546. return 0;
  547. }
  548.  
  549. float time_it(void (*f)(T*, T*, T*, int), int n, float min_t)
  550. {
  551. T* lhs = make_matrix(n);
  552. T* rhs = make_matrix(n);
  553. T* result = make_matrix(n);
  554.  
  555. clock_t start = clock();
  556. for(int i = 1;; ++i)
  557. {
  558. f(lhs, rhs, result, n);
  559.  
  560. clock_t end = clock();
  561. float seconds = (float)(end - start) / CLOCKS_PER_SEC;
  562. if (seconds >= min_t)
  563. {
  564. return seconds / i;
  565. }
  566. }
  567.  
  568. destroy_matrix(lhs);
  569. destroy_matrix(rhs);
  570. destroy_matrix(result);
  571. }
  572.  
  573. int main(int argc, const char* argv[]) {
  574. int i, j, iret;
  575. double first[SIZE][SIZE];
  576. double second[SIZE][SIZE];
  577. double multiply[SIZE][SIZE];
  578. double dtime;
  579.  
  580. /* PAPI variables */
  581. int measure;
  582. int * events;
  583. long long * values;
  584. int numevents;
  585. int retval;
  586. int check;
  587. float real_time;
  588. float proc_time;
  589. float mflops;
  590. long long flpins;
  591. char * errorstring;
  592.  
  593.  
  594. /* Initalize matrices */
  595. for (i = 0; i < SIZE; i++) { //rows in first
  596. for (j = 0; j < SIZE; j++) { //columns in first
  597. first[i][j] = i + j;
  598. second[i][j] = i - j;
  599. multiply[i][j] = 0.0;
  600. }
  601. }
  602.  
  603.  
  604. measure = 0;
  605. if (argc > 1) {
  606. measure = atoi(argv[1]);
  607. }
  608.  
  609. /* Start PAPI counters*/
  610. if (measure == 1) {
  611. retval = PAPI_flops(&real_time, &proc_time, &flpins, &mflops);
  612. if (retval != PAPI_OK) {
  613. errorstring = PAPI_strerror(retval);
  614. fprintf(stderr, errorstring);
  615. fprintf(stderr, "\n");
  616. free(errorstring);
  617. exit(1);
  618. }
  619. printf("PAPI started\n");
  620. }
  621. if (measure == 2) {
  622. numevents = 2;
  623. events = malloc(sizeof *events * numevents);
  624. events[0] = PAPI_L1_ICM;
  625. events[1] = PAPI_FP_OPS;
  626. PAPI_library_init(check);
  627. retval = PAPI_start_counters(events, numevents);
  628. if (retval != PAPI_OK) {
  629. errorstring = PAPI_strerror(retval);
  630. fprintf(stderr, errorstring);
  631. fprintf(stderr, "\n");
  632. free(errorstring);
  633. exit(1);
  634. }
  635. printf("PAPI started\n");
  636. }
  637.  
  638. /* Measure execution time */
  639. if (measure == 0) {
  640. dtime = dclock();
  641. }
  642.  
  643. /* Here is call to matrix multiplication functionality */
  644. int n = 1024;
  645. float min_t = 0.0f;
  646. time_it(mm0, n, min_t)
  647. //iret = mm(first, second, multiply);
  648.  
  649. /* Measure execution time */
  650. if (measure == 0) {
  651. dtime = dclock() - dtime;
  652. printf("Time: %le \n", dtime);
  653. }
  654.  
  655. /* Here is PAPI reading and printout */
  656. if (measure == 1) {
  657. retval = PAPI_flops(&real_time, &proc_time, &flpins, &mflops);
  658. if (retval != PAPI_OK) {
  659. errorstring = PAPI_strerror(retval);
  660. fprintf(stderr, errorstring);
  661. fprintf(stderr, "\n");
  662. free(errorstring);
  663. exit(1);
  664. }
  665. printf("Real_time: %f Proc_time: %f Total flpops: %lld MFLOPS: %f\n", real_time, proc_time, flpins, mflops);
  666. }
  667. if (measure == 2) {
  668. values = malloc(sizeof *values * numevents);
  669. retval = PAPI_stop_counters(values, numevents);
  670. if (retval != PAPI_OK) {
  671. errorstring = PAPI_strerror(retval);
  672. fprintf(stderr, errorstring);
  673. fprintf(stderr, "\n");
  674. free(errorstring);
  675. exit(1);
  676. }
  677. printf("Mem loads: %lld Mem store: %lld\n", values[0], values[1]);
  678. free(values);
  679. free(events);
  680. }
  681.  
  682. /* Checking part of the code. Proper value is 2.932020e+12 */
  683. double mcheck = 0.0;
  684. for (i = 0; i < SIZE; i++) {
  685. for (j = 0; j < SIZE; j++) {
  686. mcheck += multiply[i][j];
  687. }
  688. }
  689. printf("check %le \n", mcheck);
  690. fflush(stdout);
  691.  
  692. return iret;
  693. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement