Advertisement
Guest User

77GFLOPS on a single Broadwell core

a guest
Mar 26th, 2017
418
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.43 KB | None | 0 0
  1. #include <iostream>
  2. #include <sys/time.h>
  3. #include <stdlib.h>
  4. #include <sys/mman.h>
  5. #include <omp.h>
  6.  
  7. /*
  8.  
  9. 77 GFLOPS on a single Broadwell core, by Paul Sutter
  10.  
  11. to compile
  12.  
  13. g++ peakgflops.cpp -o peakgflops -std=c++11 -pthread -O3 -mavx2 -mfma -fabi-version=0 -ffp-contract=fast
  14.  
  15. to get assembler:
  16.  
  17. g++ peakgflops.cpp -S -std=c++11 -pthread -O3 -mavx2 -mfma -fabi-version=0 -ffp-contract=fast
  18.  
  19. */
  20.  
  21. #include <immintrin.h> // For AVX instructions
  22.  
  // Elapsed microseconds from timeval `a` to timeval `b` (b - a), as long long.
  // Both parameters are parenthesized so that arbitrary expressions
  // (e.g. a dereferenced pointer) expand correctly.
  #define usec(b, a) (((b).tv_sec - (a).tv_sec) * 1000000LL + ((b).tv_usec - (a).tv_usec))

  // Number of __m256 vectors in the streamed buffer (2^26 vectors of 32 bytes).
  #define LENGTH (1<<26)
  // FMA groups applied to each streamed vector.
  #define CYCLES 4LL
  // Independent accumulator vectors updated in each FMA group.
  #define ACCUMULATORS 10LL
  // float lanes in one __m256.
  #define WIDTH 8LL
  29.  
  // Returns a pseudo-random float in [-0.5, 0.5], drawn from std::rand().
  // The arithmetic stays in single precision (0.5f rather than 0.5) so the
  // expression is not silently promoted to double and narrowed back on return.
  float frand() {
      const float unit = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
      return unit - 0.5f;
  }
  33.  
  34. int main() {
  35. struct timeval start, end;
  36. std::srand(time(NULL));
  37.  
  38. // vector
  39. __m256* vec;
  40. register __m256* v;
  41. unsigned int bytes = sizeof(__m256) * LENGTH;
  42.  
  43. // coefficients
  44. __m256 coeff[CYCLES];
  45. for (int i=0; i<CYCLES; i++) {
  46. coeff[i] = _mm256_set1_ps(frand());
  47. }
  48.  
  49. // allocate vectors
  50. vec = (__m256*) mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  51.  
  52. // initialize vectors
  53. gettimeofday(&start, NULL);
  54. v = vec;
  55. int count = LENGTH;
  56. while (count--) {
  57. *(v++) = _mm256_set1_ps(frand());
  58. }
  59. gettimeofday(&end, NULL);
  60.  
  61. std::cout << "initialized " << bytes << " bytes, usec " << usec(end,start) << std::endl;
  62.  
  63. // setup registers
  64. register __m256 x0;
  65. register __m256 x1;
  66. register __m256 c0;
  67. register __m256 c1;
  68.  
  69. register __m256 a0 = _mm256_set1_ps(frand());
  70. register __m256 a1 = _mm256_set1_ps(frand());
  71. register __m256 a2 = _mm256_set1_ps(frand());
  72. register __m256 a3 = _mm256_set1_ps(frand());
  73. register __m256 a4 = _mm256_set1_ps(frand());
  74. register __m256 a5 = _mm256_set1_ps(frand());
  75. register __m256 a6 = _mm256_set1_ps(frand());
  76. register __m256 a7 = _mm256_set1_ps(frand());
  77. register __m256 a8 = _mm256_set1_ps(frand());
  78. register __m256 a9 = _mm256_set1_ps(frand());
  79.  
  80. v = vec;
  81. x0 = *(v++);
  82. c0 = _mm256_set1_ps(frand());
  83. c1 = _mm256_set1_ps(frand());
  84. count = LENGTH;
  85.  
  86. gettimeofday(&start, NULL);
  87. while (count-=2) {
  88.  
  89. x1 = *(v++); // sequential DRAM read
  90.  
  91. a0 = _mm256_fmadd_ps(c0,x0,a0); a1 = _mm256_fmadd_ps(c0,x0,a1);
  92. a2 = _mm256_fmadd_ps(c0,x0,a2); a3 = _mm256_fmadd_ps(c0,x0,a3);
  93. a4 = _mm256_fmadd_ps(c0,x0,a4); a5 = _mm256_fmadd_ps(c0,x0,a5);
  94. a6 = _mm256_fmadd_ps(c0,x0,a6); a7 = _mm256_fmadd_ps(c0,x0,a7);
  95. a8 = _mm256_fmadd_ps(c0,x0,a8); a9 = _mm256_fmadd_ps(c0,x0,a9);
  96.  
  97. a0 = _mm256_fmadd_ps(c1,x0,a0); a1 = _mm256_fmadd_ps(c1,x0,a1);
  98. a2 = _mm256_fmadd_ps(c1,x0,a2); a3 = _mm256_fmadd_ps(c1,x0,a3);
  99. a4 = _mm256_fmadd_ps(c1,x0,a4); a5 = _mm256_fmadd_ps(c1,x0,a5);
  100. a6 = _mm256_fmadd_ps(c1,x0,a6); a7 = _mm256_fmadd_ps(c1,x0,a7);
  101. a8 = _mm256_fmadd_ps(c1,x0,a8); a9 = _mm256_fmadd_ps(c1,x0,a9);
  102.  
  103. a0 = _mm256_fmadd_ps(c0,x0,a0); a1 = _mm256_fmadd_ps(c0,x0,a1);
  104. a2 = _mm256_fmadd_ps(c0,x0,a2); a3 = _mm256_fmadd_ps(c0,x0,a3);
  105. a4 = _mm256_fmadd_ps(c0,x0,a4); a5 = _mm256_fmadd_ps(c0,x0,a5);
  106. a6 = _mm256_fmadd_ps(c0,x0,a6); a7 = _mm256_fmadd_ps(c0,x0,a7);
  107. a8 = _mm256_fmadd_ps(c0,x0,a8); a9 = _mm256_fmadd_ps(c0,x0,a9);
  108.  
  109. a0 = _mm256_fmadd_ps(c1,x0,a0); a1 = _mm256_fmadd_ps(c1,x0,a1);
  110. a2 = _mm256_fmadd_ps(c1,x0,a2); a3 = _mm256_fmadd_ps(c1,x0,a3);
  111. a4 = _mm256_fmadd_ps(c1,x0,a4); a5 = _mm256_fmadd_ps(c1,x0,a5);
  112. a6 = _mm256_fmadd_ps(c1,x0,a6); a7 = _mm256_fmadd_ps(c1,x0,a7);
  113. a8 = _mm256_fmadd_ps(c1,x0,a8); a9 = _mm256_fmadd_ps(c1,x0,a9);
  114.  
  115. x0 = *(v++); // sequential DRAM read
  116.  
  117. a0 = _mm256_fmadd_ps(c0,x1,a0); a1 = _mm256_fmadd_ps(c0,x1,a1);
  118. a2 = _mm256_fmadd_ps(c0,x1,a2); a3 = _mm256_fmadd_ps(c0,x1,a3);
  119. a4 = _mm256_fmadd_ps(c0,x1,a4); a5 = _mm256_fmadd_ps(c0,x1,a5);
  120. a6 = _mm256_fmadd_ps(c0,x1,a6); a7 = _mm256_fmadd_ps(c0,x1,a7);
  121. a8 = _mm256_fmadd_ps(c0,x1,a8); a9 = _mm256_fmadd_ps(c0,x1,a9);
  122.  
  123. a0 = _mm256_fmadd_ps(c1,x1,a0); a1 = _mm256_fmadd_ps(c1,x1,a1);
  124. a2 = _mm256_fmadd_ps(c1,x1,a2); a3 = _mm256_fmadd_ps(c1,x1,a3);
  125. a4 = _mm256_fmadd_ps(c1,x1,a4); a5 = _mm256_fmadd_ps(c1,x1,a5);
  126. a6 = _mm256_fmadd_ps(c1,x1,a6); a7 = _mm256_fmadd_ps(c1,x1,a7);
  127. a8 = _mm256_fmadd_ps(c1,x1,a8); a9 = _mm256_fmadd_ps(c1,x1,a9);
  128.  
  129. a0 = _mm256_fmadd_ps(c0,x1,a0); a1 = _mm256_fmadd_ps(c0,x1,a1);
  130. a2 = _mm256_fmadd_ps(c0,x1,a2); a3 = _mm256_fmadd_ps(c0,x1,a3);
  131. a4 = _mm256_fmadd_ps(c0,x1,a4); a5 = _mm256_fmadd_ps(c0,x1,a5);
  132. a6 = _mm256_fmadd_ps(c0,x1,a6); a7 = _mm256_fmadd_ps(c0,x1,a7);
  133. a8 = _mm256_fmadd_ps(c0,x1,a8); a9 = _mm256_fmadd_ps(c0,x1,a9);
  134.  
  135. a0 = _mm256_fmadd_ps(c1,x1,a0); a1 = _mm256_fmadd_ps(c1,x1,a1);
  136. a2 = _mm256_fmadd_ps(c1,x1,a2); a3 = _mm256_fmadd_ps(c1,x1,a3);
  137. a4 = _mm256_fmadd_ps(c1,x1,a4); a5 = _mm256_fmadd_ps(c1,x1,a5);
  138. a6 = _mm256_fmadd_ps(c1,x1,a6); a7 = _mm256_fmadd_ps(c1,x1,a7);
  139. a8 = _mm256_fmadd_ps(c1,x1,a8); a9 = _mm256_fmadd_ps(c1,x1,a9);
  140. }
  141. gettimeofday(&end, NULL);
  142.  
  143. long long flop = LENGTH * CYCLES * ACCUMULATORS * WIDTH * 2LL; // 2 flops per op
  144. long long dur = usec(end,start);
  145. std::cout << "ops 10x10 " << flop << " flops, usec " << dur << ", mflops " << (flop/dur)
  146. << ", MB/s " << (bytes*1000000LL) / (dur*1024*1024) << std::endl;
  147.  
  148. // sum up the accumulators and output them so that the optimizer doesnt get greedy
  149.  
  150. union U {
  151. __m256 a256;
  152. float a[8];
  153. } u;
  154. u.a256 = a0+a1+a2+a3+a4+a5+a6+a7+a8+a9;
  155.  
  156. std::cout << "results:" << u.a[0]+u.a[1]+u.a[2]+u.a[3]+u.a[4]+u.a[5]+u.a[6]+u.a[7] << std::endl;
  157.  
  158. return 0;
  159. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement