Advertisement
Guest User

Untitled

a guest
Jul 28th, 2015
227
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.12 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <string.h>
  4. #include <sys/time.h>
  5.  
  6. #define SIZE_KB {8, 16, 24, 28, 32, 36, 40, 48, 64, 128, 256, 384, 512, 768, 1024, 1025, 2048, 4096, 8192, 16384, 200000}
  7. #define TESTMEM 10000000000 // Approximate, in bytes
  8. #define BUFFERS 1
  9.  
  10. double timer(void)
  11. {
  12. struct timeval ts;
  13. double ans;
  14.  
  15. gettimeofday(&ts, NULL);
  16. ans = ts.tv_sec + ts.tv_usec*1.0e-6;
  17.  
  18. return ans;
  19. }
  20.  
  21. int main(int argc, char **argv)
  22. {
  23. double *x[BUFFERS];
  24. double t1, t2;
  25. int kbsizes[] = SIZE_KB;
  26. double bandwidth[sizeof(kbsizes)/sizeof(int)];
  27. int iterations[sizeof(kbsizes)/sizeof(int)];
  28. double *address[sizeof(kbsizes)/sizeof(int)][BUFFERS];
  29. int i, j, k;
  30.  
  31. for (k = 0; k < sizeof(kbsizes)/sizeof(int); k++)
  32. iterations[k] = TESTMEM/(kbsizes[k]*1024);
  33.  
  34. for (k = 0; k < sizeof(kbsizes)/sizeof(int); k++)
  35. {
  36. // Allocate
  37. for (j = 0; j < BUFFERS; j++)
  38. {
  39. x[j] = (double *) malloc(kbsizes[k]*1024);
  40. address[k][j] = x[j];
  41. memset(x[j], 0, kbsizes[k]*1024);
  42. }
  43.  
  44. // Measure
  45. t1 = timer();
  46. for (i = 0; i < iterations[k]; i++)
  47. {
  48. for (j = 0; j < BUFFERS; j++)
  49. memset(x[j], 0xff, kbsizes[k]*1024);
  50. }
  51. t2 = timer();
  52. bandwidth[k] = (BUFFERS*kbsizes[k]*iterations[k])/1024.0/1024.0/(t2-t1);
  53.  
  54. // Free
  55. for (j = 0; j < BUFFERS; j++)
  56. free(x[j]);
  57. }
  58.  
  59. printf("TESTMEM = %ldn", TESTMEM);
  60. printf("BUFFERS = %dn", BUFFERS);
  61. printf("Size (kB)tBandwidth (GB/s)tIterationstAddressesn");
  62. for (k = 0; k < sizeof(kbsizes)/sizeof(int); k++)
  63. {
  64. printf("%7dtt%.2fttt%dtt%x", kbsizes[k], bandwidth[k], iterations[k], address[k][0]);
  65. for (j = 1; j < BUFFERS; j++)
  66. printf(", %x", address[k][j]);
  67. printf("n");
  68. }
  69.  
  70. return 0;
  71. }
  72.  
  73. TESTMEM = 10000000000
  74. BUFFERS = 1
  75. Size (kB) Bandwidth (GB/s) Iterations Addresses
  76. 8 52.79 1220703 90b010
  77. 16 56.48 610351 90b010
  78. 24 57.01 406901 90b010
  79. 28 57.13 348772 90b010
  80. 32 45.40 305175 90b010
  81. 36 38.11 271267 90b010
  82. 40 38.02 244140 90b010
  83. 48 38.12 203450 90b010
  84. 64 37.51 152587 90b010
  85. 128 36.89 76293 90b010
  86. 256 35.58 38146 d760f010
  87. 384 31.01 25431 d75ef010
  88. 512 26.79 19073 d75cf010
  89. 768 26.20 12715 d758f010
  90. 1024 26.20 9536 d754f010
  91. 1025 18.30 9527 90b010
  92. 2048 18.29 4768 d744f010
  93. 4096 18.29 2384 d724f010
  94. 8192 18.31 1192 d6e4f010
  95. 16384 18.31 596 d664f010
  96. 200000 18.32 48 cb2ff010
  97.  
  98. level = 3
  99. coherency_line_size = 64
  100. number_of_sets = 8192
  101. physical_line_partition = 1
  102. shared_cpu_list = 0-7
  103. shared_cpu_map = ff
  104. size = 8192K
  105. type = Unified
  106. ways_of_associativity = 16
  107.  
  108. TESTMEM = 10000000000
  109. BUFFERS = 2
  110. Size (kB) Bandwidth (GB/s) Iterations Addresses
  111. 8 54.15 1220703 e59010, e5b020
  112. 16 51.52 610351 e59010, e5d020
  113. 24 38.94 406901 e59010, e5f020
  114. 28 38.53 348772 e59010, e60020
  115. 32 38.31 305175 e59010, e61020
  116. 36 38.29 271267 e59010, e62020
  117. 40 38.29 244140 e59010, e63020
  118. 48 37.46 203450 e59010, e65020
  119. 64 36.93 152587 e59010, e69020
  120. 128 35.67 76293 e59010, 63769010
  121. 256 27.21 38146 63724010, 636e3010
  122. 384 26.26 25431 63704010, 636a3010
  123. 512 26.19 19073 636e4010, 63663010
  124. 768 26.20 12715 636a4010, 635e3010
  125. 1024 26.16 9536 63664010, 63563010
  126. 1025 18.29 9527 e59010, f59420
  127. 2048 18.23 4768 63564010, 63363010
  128. 4096 18.27 2384 63364010, 62f63010
  129. 8192 18.29 1192 62f64010, 62763010
  130. 16384 18.31 596 62764010, 61763010
  131. 200000 18.31 48 57414010, 4b0c3010
  132.  
  133. perf stat -e dTLB-loads,dTLB-load-misses,dTLB-stores,dTLB-store-misses -r 100 ./a.out 2> perfout.txt
  134.  
  135. Performance counter stats for './a.out' (100 runs):
  136.  
  137. 1,508,798 dTLB-loads ( +- 0.02% )
  138. 0 dTLB-load-misses # 0.00% of all dTLB cache hits
  139. 625,967,550 dTLB-stores ( +- 0.00% )
  140. 1,503 dTLB-store-misses ( +- 0.79% )
  141.  
  142. 0.360471583 seconds time elapsed ( +- 0.79% )
  143.  
  144. Performance counter stats for './a.out' (100 runs):
  145.  
  146. 1,670,402 dTLB-loads ( +- 0.09% )
  147. 0 dTLB-load-misses # 0.00% of all dTLB cache hits
  148. 626,099,850 dTLB-stores ( +- 0.00% )
  149. 2,115 dTLB-store-misses ( +- 2.19% )
  150.  
  151. 0.503913416 seconds time elapsed ( +- 0.06% )
  152.  
  153. 2.35 │a0:┌─+movdqa %xmm8,(%rcx)
  154. 54.90 │ │ movdqa %xmm8,0x10(%rcx)
  155. 32.85 │ │ movdqa %xmm8,0x20(%rcx)
  156. 1.73 │ │ movdqa %xmm8,0x30(%rcx)
  157. 8.11 │ │ add $0x40,%rcx
  158. 0.03 │ │ cmp %rcx,%rdx
  159. │ └──jne a0
  160.  
  161. │a00:┌─+lea -0x80(%r8),%r8
  162. 0.01 │ │ cmp $0x80,%r8
  163. 5.33 │ │ movdqa %xmm0,(%rdi)
  164. 4.67 │ │ movdqa %xmm0,0x10(%rdi)
  165. 6.69 │ │ movdqa %xmm0,0x20(%rdi)
  166. 31.23 │ │ movdqa %xmm0,0x30(%rdi)
  167. 18.35 │ │ movdqa %xmm0,0x40(%rdi)
  168. 0.27 │ │ movdqa %xmm0,0x50(%rdi)
  169. 3.24 │ │ movdqa %xmm0,0x60(%rdi)
  170. 16.36 │ │ movdqa %xmm0,0x70(%rdi)
  171. 13.76 │ │ lea 0x80(%rdi),%rdi
  172. │ └──jge a00
  173.  
  174. │a60:┌─+lea -0x80(%r8),%r8
  175. 0.15 │ │ cmp $0x80,%r8
  176. 1.36 │ │ movntd %xmm0,(%rdi)
  177. 0.24 │ │ movntd %xmm0,0x10(%rdi)
  178. 1.49 │ │ movntd %xmm0,0x20(%rdi)
  179. 44.89 │ │ movntd %xmm0,0x30(%rdi)
  180. 5.46 │ │ movntd %xmm0,0x40(%rdi)
  181. 0.02 │ │ movntd %xmm0,0x50(%rdi)
  182. 0.74 │ │ movntd %xmm0,0x60(%rdi)
  183. 40.14 │ │ movntd %xmm0,0x70(%rdi)
  184. 5.50 │ │ lea 0x80(%rdi),%rdi
  185. │ └──jge a60
  186.  
  187. L(byte32sse2_pre):
  188.  
  189. mov __x86_shared_cache_size(%rip),%r9d # The largest cache size
  190. cmp %r9,%r8
  191. ja L(sse2_nt_move_pre)
  192.  
  193. long int __x86_64_shared_cache_size attribute_hidden = 1024 * 1024;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement