Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
- #define SIZE_KB {8, 16, 24, 28, 32, 36, 40, 48, 64, 128, 256, 384, 512, 768, 1024, 1025, 2048, 4096, 8192, 16384, 200000}
- #define TESTMEM 10000000000 // Approximate, in bytes
- #define BUFFERS 1
/*
 * Return the current time of day, as reported by gettimeofday(), expressed
 * in seconds as a double (whole seconds plus the microsecond field scaled
 * by 1e-6). The return value of gettimeofday() itself is not checked.
 */
double timer(void)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec + now.tv_usec*1.0e-6;
}
- int main(int argc, char **argv)
- {
- double *x[BUFFERS];
- double t1, t2;
- int kbsizes[] = SIZE_KB;
- double bandwidth[sizeof(kbsizes)/sizeof(int)];
- int iterations[sizeof(kbsizes)/sizeof(int)];
- double *address[sizeof(kbsizes)/sizeof(int)][BUFFERS];
- int i, j, k;
- for (k = 0; k < sizeof(kbsizes)/sizeof(int); k++)
- iterations[k] = TESTMEM/(kbsizes[k]*1024);
- for (k = 0; k < sizeof(kbsizes)/sizeof(int); k++)
- {
- // Allocate
- for (j = 0; j < BUFFERS; j++)
- {
- x[j] = (double *) malloc(kbsizes[k]*1024);
- address[k][j] = x[j];
- memset(x[j], 0, kbsizes[k]*1024);
- }
- // Measure
- t1 = timer();
- for (i = 0; i < iterations[k]; i++)
- {
- for (j = 0; j < BUFFERS; j++)
- memset(x[j], 0xff, kbsizes[k]*1024);
- }
- t2 = timer();
- bandwidth[k] = (BUFFERS*kbsizes[k]*iterations[k])/1024.0/1024.0/(t2-t1);
- // Free
- for (j = 0; j < BUFFERS; j++)
- free(x[j]);
- }
- printf("TESTMEM = %ldn", TESTMEM);
- printf("BUFFERS = %dn", BUFFERS);
- printf("Size (kB)tBandwidth (GB/s)tIterationstAddressesn");
- for (k = 0; k < sizeof(kbsizes)/sizeof(int); k++)
- {
- printf("%7dtt%.2fttt%dtt%x", kbsizes[k], bandwidth[k], iterations[k], address[k][0]);
- for (j = 1; j < BUFFERS; j++)
- printf(", %x", address[k][j]);
- printf("n");
- }
- return 0;
- }
- TESTMEM = 10000000000
- BUFFERS = 1
- Size (kB) Bandwidth (GB/s) Iterations Addresses
- 8 52.79 1220703 90b010
- 16 56.48 610351 90b010
- 24 57.01 406901 90b010
- 28 57.13 348772 90b010
- 32 45.40 305175 90b010
- 36 38.11 271267 90b010
- 40 38.02 244140 90b010
- 48 38.12 203450 90b010
- 64 37.51 152587 90b010
- 128 36.89 76293 90b010
- 256 35.58 38146 d760f010
- 384 31.01 25431 d75ef010
- 512 26.79 19073 d75cf010
- 768 26.20 12715 d758f010
- 1024 26.20 9536 d754f010
- 1025 18.30 9527 90b010
- 2048 18.29 4768 d744f010
- 4096 18.29 2384 d724f010
- 8192 18.31 1192 d6e4f010
- 16384 18.31 596 d664f010
- 200000 18.32 48 cb2ff010
- level = 3
- coherency_line_size = 64
- number_of_sets = 8192
- physical_line_partition = 1
- shared_cpu_list = 0-7
- shared_cpu_map = ff
- size = 8192K
- type = Unified
- ways_of_associativity = 16
- TESTMEM = 10000000000
- BUFFERS = 2
- Size (kB) Bandwidth (GB/s) Iterations Addresses
- 8 54.15 1220703 e59010, e5b020
- 16 51.52 610351 e59010, e5d020
- 24 38.94 406901 e59010, e5f020
- 28 38.53 348772 e59010, e60020
- 32 38.31 305175 e59010, e61020
- 36 38.29 271267 e59010, e62020
- 40 38.29 244140 e59010, e63020
- 48 37.46 203450 e59010, e65020
- 64 36.93 152587 e59010, e69020
- 128 35.67 76293 e59010, 63769010
- 256 27.21 38146 63724010, 636e3010
- 384 26.26 25431 63704010, 636a3010
- 512 26.19 19073 636e4010, 63663010
- 768 26.20 12715 636a4010, 635e3010
- 1024 26.16 9536 63664010, 63563010
- 1025 18.29 9527 e59010, f59420
- 2048 18.23 4768 63564010, 63363010
- 4096 18.27 2384 63364010, 62f63010
- 8192 18.29 1192 62f64010, 62763010
- 16384 18.31 596 62764010, 61763010
- 200000 18.31 48 57414010, 4b0c3010
- perf stat -e dTLB-loads,dTLB-load-misses,dTLB-stores,dTLB-store-misses -r 100 ./a.out 2> perfout.txt
- Performance counter stats for './a.out' (100 runs):
- 1,508,798 dTLB-loads ( +- 0.02% )
- 0 dTLB-load-misses # 0.00% of all dTLB cache hits
- 625,967,550 dTLB-stores ( +- 0.00% )
- 1,503 dTLB-store-misses ( +- 0.79% )
- 0.360471583 seconds time elapsed ( +- 0.79% )
- Performance counter stats for './a.out' (100 runs):
- 1,670,402 dTLB-loads ( +- 0.09% )
- 0 dTLB-load-misses # 0.00% of all dTLB cache hits
- 626,099,850 dTLB-stores ( +- 0.00% )
- 2,115 dTLB-store-misses ( +- 2.19% )
- 0.503913416 seconds time elapsed ( +- 0.06% )
- 2.35 │a0:┌─+movdqa %xmm8,(%rcx)
- 54.90 │ │ movdqa %xmm8,0x10(%rcx)
- 32.85 │ │ movdqa %xmm8,0x20(%rcx)
- 1.73 │ │ movdqa %xmm8,0x30(%rcx)
- 8.11 │ │ add $0x40,%rcx
- 0.03 │ │ cmp %rcx,%rdx
- │ └──jne a0
- │a00:┌─+lea -0x80(%r8),%r8
- 0.01 │ │ cmp $0x80,%r8
- 5.33 │ │ movdqa %xmm0,(%rdi)
- 4.67 │ │ movdqa %xmm0,0x10(%rdi)
- 6.69 │ │ movdqa %xmm0,0x20(%rdi)
- 31.23 │ │ movdqa %xmm0,0x30(%rdi)
- 18.35 │ │ movdqa %xmm0,0x40(%rdi)
- 0.27 │ │ movdqa %xmm0,0x50(%rdi)
- 3.24 │ │ movdqa %xmm0,0x60(%rdi)
- 16.36 │ │ movdqa %xmm0,0x70(%rdi)
- 13.76 │ │ lea 0x80(%rdi),%rdi
- │ └──jge a00
- │a60:┌─+lea -0x80(%r8),%r8
- 0.15 │ │ cmp $0x80,%r8
- 1.36 │ │ movntd %xmm0,(%rdi)
- 0.24 │ │ movntd %xmm0,0x10(%rdi)
- 1.49 │ │ movntd %xmm0,0x20(%rdi)
- 44.89 │ │ movntd %xmm0,0x30(%rdi)
- 5.46 │ │ movntd %xmm0,0x40(%rdi)
- 0.02 │ │ movntd %xmm0,0x50(%rdi)
- 0.74 │ │ movntd %xmm0,0x60(%rdi)
- 40.14 │ │ movntd %xmm0,0x70(%rdi)
- 5.50 │ │ lea 0x80(%rdi),%rdi
- │ └──jge a60
- L(byte32sse2_pre):
- mov __x86_shared_cache_size(%rip),%r9d # The largest cache size
- cmp %r9,%r8
- ja L(sse2_nt_move_pre)
- long int __x86_64_shared_cache_size attribute_hidden = 1024 * 1024;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement