Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/time.h>

#include <cstdlib>
#include <ctime>
#include <iostream>

#include <omp.h>
- /*
- 77 GFLOPS on a single Broadwell core, by Paul Sutter
- to compile
- g++ peakgflops.cpp -o peakgflops -std=c++11 -pthread -O3 -mavx2 -mfma -fabi-version=0 -ffp-contract=fast
- to get assembler:
- g++ peakgflops.cpp -S -std=c++11 -pthread -O3 -mavx2 -mfma -fabi-version=0 -ffp-contract=fast
- */
- #include <immintrin.h> // For AVX instructions
// Elapsed time from timeval `a` to timeval `b` (i.e. b - a) in microseconds,
// computed as long long. Both arguments are fully parenthesized so that
// expressions such as `*ptr` or `arr[i]` expand with the intended precedence.
#define usec(b, a) (((b).tv_sec - (a).tv_sec) * 1000000LL + ((b).tv_usec - (a).tv_usec))
// Number of __m256 vectors in the benchmark buffer (2^26).
#define LENGTH (1<<26)
// Rounds of fused multiply-adds applied to each loaded vector.
#define CYCLES 4LL
// Independent accumulator registers updated in every round (a0..a9 in main).
#define ACCUMULATORS 10LL
// Single-precision lanes per 256-bit vector.
#define WIDTH 8LL
// Draws one pseudo-random float from std::rand(): the raw value is divided by
// RAND_MAX (as a float) and then shifted down by one half, giving a result
// centred on zero in the range [-0.5, 0.5].
float frand() {
    const float scale = static_cast<float>(RAND_MAX);
    const float unit = std::rand() / scale;
    return unit - 0.5;
}
- int main() {
- struct timeval start, end;
- std::srand(time(NULL));
- // vector
- __m256* vec;
- register __m256* v;
- unsigned int bytes = sizeof(__m256) * LENGTH;
- // coefficients
- __m256 coeff[CYCLES];
- for (int i=0; i<CYCLES; i++) {
- coeff[i] = _mm256_set1_ps(frand());
- }
- // allocate vectors
- vec = (__m256*) mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- // initialize vectors
- gettimeofday(&start, NULL);
- v = vec;
- int count = LENGTH;
- while (count--) {
- *(v++) = _mm256_set1_ps(frand());
- }
- gettimeofday(&end, NULL);
- std::cout << "initialized " << bytes << " bytes, usec " << usec(end,start) << std::endl;
- // setup registers
- register __m256 x0;
- register __m256 x1;
- register __m256 c0;
- register __m256 c1;
- register __m256 a0 = _mm256_set1_ps(frand());
- register __m256 a1 = _mm256_set1_ps(frand());
- register __m256 a2 = _mm256_set1_ps(frand());
- register __m256 a3 = _mm256_set1_ps(frand());
- register __m256 a4 = _mm256_set1_ps(frand());
- register __m256 a5 = _mm256_set1_ps(frand());
- register __m256 a6 = _mm256_set1_ps(frand());
- register __m256 a7 = _mm256_set1_ps(frand());
- register __m256 a8 = _mm256_set1_ps(frand());
- register __m256 a9 = _mm256_set1_ps(frand());
- v = vec;
- x0 = *(v++);
- c0 = _mm256_set1_ps(frand());
- c1 = _mm256_set1_ps(frand());
- count = LENGTH;
- gettimeofday(&start, NULL);
- while (count-=2) {
- x1 = *(v++); // sequential DRAM read
- a0 = _mm256_fmadd_ps(c0,x0,a0); a1 = _mm256_fmadd_ps(c0,x0,a1);
- a2 = _mm256_fmadd_ps(c0,x0,a2); a3 = _mm256_fmadd_ps(c0,x0,a3);
- a4 = _mm256_fmadd_ps(c0,x0,a4); a5 = _mm256_fmadd_ps(c0,x0,a5);
- a6 = _mm256_fmadd_ps(c0,x0,a6); a7 = _mm256_fmadd_ps(c0,x0,a7);
- a8 = _mm256_fmadd_ps(c0,x0,a8); a9 = _mm256_fmadd_ps(c0,x0,a9);
- a0 = _mm256_fmadd_ps(c1,x0,a0); a1 = _mm256_fmadd_ps(c1,x0,a1);
- a2 = _mm256_fmadd_ps(c1,x0,a2); a3 = _mm256_fmadd_ps(c1,x0,a3);
- a4 = _mm256_fmadd_ps(c1,x0,a4); a5 = _mm256_fmadd_ps(c1,x0,a5);
- a6 = _mm256_fmadd_ps(c1,x0,a6); a7 = _mm256_fmadd_ps(c1,x0,a7);
- a8 = _mm256_fmadd_ps(c1,x0,a8); a9 = _mm256_fmadd_ps(c1,x0,a9);
- a0 = _mm256_fmadd_ps(c0,x0,a0); a1 = _mm256_fmadd_ps(c0,x0,a1);
- a2 = _mm256_fmadd_ps(c0,x0,a2); a3 = _mm256_fmadd_ps(c0,x0,a3);
- a4 = _mm256_fmadd_ps(c0,x0,a4); a5 = _mm256_fmadd_ps(c0,x0,a5);
- a6 = _mm256_fmadd_ps(c0,x0,a6); a7 = _mm256_fmadd_ps(c0,x0,a7);
- a8 = _mm256_fmadd_ps(c0,x0,a8); a9 = _mm256_fmadd_ps(c0,x0,a9);
- a0 = _mm256_fmadd_ps(c1,x0,a0); a1 = _mm256_fmadd_ps(c1,x0,a1);
- a2 = _mm256_fmadd_ps(c1,x0,a2); a3 = _mm256_fmadd_ps(c1,x0,a3);
- a4 = _mm256_fmadd_ps(c1,x0,a4); a5 = _mm256_fmadd_ps(c1,x0,a5);
- a6 = _mm256_fmadd_ps(c1,x0,a6); a7 = _mm256_fmadd_ps(c1,x0,a7);
- a8 = _mm256_fmadd_ps(c1,x0,a8); a9 = _mm256_fmadd_ps(c1,x0,a9);
- x0 = *(v++); // sequential DRAM read
- a0 = _mm256_fmadd_ps(c0,x1,a0); a1 = _mm256_fmadd_ps(c0,x1,a1);
- a2 = _mm256_fmadd_ps(c0,x1,a2); a3 = _mm256_fmadd_ps(c0,x1,a3);
- a4 = _mm256_fmadd_ps(c0,x1,a4); a5 = _mm256_fmadd_ps(c0,x1,a5);
- a6 = _mm256_fmadd_ps(c0,x1,a6); a7 = _mm256_fmadd_ps(c0,x1,a7);
- a8 = _mm256_fmadd_ps(c0,x1,a8); a9 = _mm256_fmadd_ps(c0,x1,a9);
- a0 = _mm256_fmadd_ps(c1,x1,a0); a1 = _mm256_fmadd_ps(c1,x1,a1);
- a2 = _mm256_fmadd_ps(c1,x1,a2); a3 = _mm256_fmadd_ps(c1,x1,a3);
- a4 = _mm256_fmadd_ps(c1,x1,a4); a5 = _mm256_fmadd_ps(c1,x1,a5);
- a6 = _mm256_fmadd_ps(c1,x1,a6); a7 = _mm256_fmadd_ps(c1,x1,a7);
- a8 = _mm256_fmadd_ps(c1,x1,a8); a9 = _mm256_fmadd_ps(c1,x1,a9);
- a0 = _mm256_fmadd_ps(c0,x1,a0); a1 = _mm256_fmadd_ps(c0,x1,a1);
- a2 = _mm256_fmadd_ps(c0,x1,a2); a3 = _mm256_fmadd_ps(c0,x1,a3);
- a4 = _mm256_fmadd_ps(c0,x1,a4); a5 = _mm256_fmadd_ps(c0,x1,a5);
- a6 = _mm256_fmadd_ps(c0,x1,a6); a7 = _mm256_fmadd_ps(c0,x1,a7);
- a8 = _mm256_fmadd_ps(c0,x1,a8); a9 = _mm256_fmadd_ps(c0,x1,a9);
- a0 = _mm256_fmadd_ps(c1,x1,a0); a1 = _mm256_fmadd_ps(c1,x1,a1);
- a2 = _mm256_fmadd_ps(c1,x1,a2); a3 = _mm256_fmadd_ps(c1,x1,a3);
- a4 = _mm256_fmadd_ps(c1,x1,a4); a5 = _mm256_fmadd_ps(c1,x1,a5);
- a6 = _mm256_fmadd_ps(c1,x1,a6); a7 = _mm256_fmadd_ps(c1,x1,a7);
- a8 = _mm256_fmadd_ps(c1,x1,a8); a9 = _mm256_fmadd_ps(c1,x1,a9);
- }
- gettimeofday(&end, NULL);
- long long flop = LENGTH * CYCLES * ACCUMULATORS * WIDTH * 2LL; // 2 flops per op
- long long dur = usec(end,start);
- std::cout << "ops 10x10 " << flop << " flops, usec " << dur << ", mflops " << (flop/dur)
- << ", MB/s " << (bytes*1000000LL) / (dur*1024*1024) << std::endl;
- // sum up the accumulators and output them so that the optimizer doesnt get greedy
- union U {
- __m256 a256;
- float a[8];
- } u;
- u.a256 = a0+a1+a2+a3+a4+a5+a6+a7+a8+a9;
- std::cout << "results:" << u.a[0]+u.a[1]+u.a[2]+u.a[3]+u.a[4]+u.a[5]+u.a[6]+u.a[7] << std::endl;
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement