// 77 GFLOPS on a single Broadwell core

#include <iostream>
#include <sys/time.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <omp.h>

/*

	77 GFLOPS on a single Broadwell core, by Paul Sutter

	to compile

		g++ peakgflops.cpp -o peakgflops -std=c++11 -pthread -O3 -mavx2 -mfma -fabi-version=0 -ffp-contract=fast

	to get assembler:

		g++ peakgflops.cpp -S -std=c++11 -pthread -O3 -mavx2 -mfma -fabi-version=0 -ffp-contract=fast

*/

#include <immintrin.h> // For AVX instructions

// Elapsed microseconds from timeval `a` to timeval `b` (i.e. b - a), as a long long.
// Both arguments are parenthesized so that expressions such as `*ptr` expand correctly.
#define usec(b, a) (((b).tv_sec - (a).tv_sec) * 1000000LL + ((b).tv_usec - (a).tv_usec))

// Number of __m256 vectors in the benchmark buffer (2^26).
#define LENGTH (1<<26)
// Rounds of FMAs applied to each loaded vector; a factor in the reported flop count.
#define CYCLES 4LL
// Number of accumulators updated per round; a factor in the reported flop count.
#define ACCUMULATORS 10LL
// Single-precision lanes per __m256 vector; a factor in the reported flop count.
#define WIDTH 8LL

// Draws one value from std::rand() and maps it onto the range [-0.5, 0.5].
float frand() {
	// scale the integer draw into [0, 1] in single precision
	const float unit = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
	// the 0.5 literal is a double, so the shift is done in double and narrowed on return
	return unit - 0.5;
}

/*
	Benchmark entry point.

	1. Maps an anonymous buffer of LENGTH __m256 vectors, fills it with random values
	   and prints how long the fill took.
	2. Streams the buffer through a hand-unrolled loop: every loaded vector is fed
	   through 4 rounds of 10 fused multiply-adds, one per accumulator a0..a9,
	   alternating between the two coefficients c0 and c1.
	3. Prints the flop count, elapsed microseconds, mflops and MB/s.
	4. Sums the accumulators and prints the total so the optimizer cannot discard
	   the FMA work.

	Returns 0 on success, EXIT_FAILURE if the buffer cannot be mapped.
*/
int main() {
	struct timeval start, end;
	std::srand(static_cast<unsigned int>(time(nullptr)));

	// buffer size in bytes; size_t so that a larger LENGTH cannot wrap a 32-bit value
	const size_t bytes = sizeof(__m256) * LENGTH;

	// allocate vectors; mmap reports failure with MAP_FAILED, not a null pointer
	void* mapping = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mapping == MAP_FAILED) {
		std::cerr << "mmap of " << bytes << " bytes failed" << std::endl;
		return EXIT_FAILURE;
	}
	__m256* vec = static_cast<__m256*>(mapping);

	// initialize vectors
	gettimeofday(&start, nullptr);
	for (int i = 0; i < LENGTH; ++i) {
		vec[i] = _mm256_set1_ps(frand());
	}
	gettimeofday(&end, nullptr);

	std::cout << "initialized " << bytes << " bytes, usec " << usec(end,start) << std::endl;

	// accumulators: plain locals, since the `register` storage class was removed in
	// C++17 and register allocation is left to the compiler
	__m256 a0 = _mm256_set1_ps(frand());
	__m256 a1 = _mm256_set1_ps(frand());
	__m256 a2 = _mm256_set1_ps(frand());
	__m256 a3 = _mm256_set1_ps(frand());
	__m256 a4 = _mm256_set1_ps(frand());
	__m256 a5 = _mm256_set1_ps(frand());
	__m256 a6 = _mm256_set1_ps(frand());
	__m256 a7 = _mm256_set1_ps(frand());
	__m256 a8 = _mm256_set1_ps(frand());
	__m256 a9 = _mm256_set1_ps(frand());

	// prime the pipeline: x0 holds the first vector before the loop starts
	__m256* v = vec;
	__m256 x0 = *(v++);
	const __m256 c0 = _mm256_set1_ps(frand());
	const __m256 c1 = _mm256_set1_ps(frand());
	int count = LENGTH;

	gettimeofday(&start, nullptr);
	// `count -= 2` is evaluated before the body, so the body runs LENGTH/2 - 1 times
	// and performs LENGTH - 1 loads in total (including the one above), all in bounds.
	// The flop figure below assumes LENGTH vectors, i.e. it overstates by two vectors.
	while (count-=2) {

		const __m256 x1 = *(v++); 	// sequential DRAM read

		a0 = _mm256_fmadd_ps(c0,x0,a0); a1 = _mm256_fmadd_ps(c0,x0,a1);
		a2 = _mm256_fmadd_ps(c0,x0,a2); a3 = _mm256_fmadd_ps(c0,x0,a3);
		a4 = _mm256_fmadd_ps(c0,x0,a4); a5 = _mm256_fmadd_ps(c0,x0,a5);
		a6 = _mm256_fmadd_ps(c0,x0,a6); a7 = _mm256_fmadd_ps(c0,x0,a7);
		a8 = _mm256_fmadd_ps(c0,x0,a8); a9 = _mm256_fmadd_ps(c0,x0,a9);

		a0 = _mm256_fmadd_ps(c1,x0,a0); a1 = _mm256_fmadd_ps(c1,x0,a1);
		a2 = _mm256_fmadd_ps(c1,x0,a2); a3 = _mm256_fmadd_ps(c1,x0,a3);
		a4 = _mm256_fmadd_ps(c1,x0,a4); a5 = _mm256_fmadd_ps(c1,x0,a5);
		a6 = _mm256_fmadd_ps(c1,x0,a6); a7 = _mm256_fmadd_ps(c1,x0,a7);
		a8 = _mm256_fmadd_ps(c1,x0,a8); a9 = _mm256_fmadd_ps(c1,x0,a9);

		a0 = _mm256_fmadd_ps(c0,x0,a0); a1 = _mm256_fmadd_ps(c0,x0,a1);
		a2 = _mm256_fmadd_ps(c0,x0,a2); a3 = _mm256_fmadd_ps(c0,x0,a3);
		a4 = _mm256_fmadd_ps(c0,x0,a4); a5 = _mm256_fmadd_ps(c0,x0,a5);
		a6 = _mm256_fmadd_ps(c0,x0,a6); a7 = _mm256_fmadd_ps(c0,x0,a7);
		a8 = _mm256_fmadd_ps(c0,x0,a8); a9 = _mm256_fmadd_ps(c0,x0,a9);

		a0 = _mm256_fmadd_ps(c1,x0,a0); a1 = _mm256_fmadd_ps(c1,x0,a1);
		a2 = _mm256_fmadd_ps(c1,x0,a2); a3 = _mm256_fmadd_ps(c1,x0,a3);
		a4 = _mm256_fmadd_ps(c1,x0,a4); a5 = _mm256_fmadd_ps(c1,x0,a5);
		a6 = _mm256_fmadd_ps(c1,x0,a6); a7 = _mm256_fmadd_ps(c1,x0,a7);
		a8 = _mm256_fmadd_ps(c1,x0,a8); a9 = _mm256_fmadd_ps(c1,x0,a9);

		x0 = *(v++);	// sequential DRAM read

		a0 = _mm256_fmadd_ps(c0,x1,a0); a1 = _mm256_fmadd_ps(c0,x1,a1);
		a2 = _mm256_fmadd_ps(c0,x1,a2); a3 = _mm256_fmadd_ps(c0,x1,a3);
		a4 = _mm256_fmadd_ps(c0,x1,a4); a5 = _mm256_fmadd_ps(c0,x1,a5);
		a6 = _mm256_fmadd_ps(c0,x1,a6); a7 = _mm256_fmadd_ps(c0,x1,a7);
		a8 = _mm256_fmadd_ps(c0,x1,a8); a9 = _mm256_fmadd_ps(c0,x1,a9);

		a0 = _mm256_fmadd_ps(c1,x1,a0); a1 = _mm256_fmadd_ps(c1,x1,a1);
		a2 = _mm256_fmadd_ps(c1,x1,a2); a3 = _mm256_fmadd_ps(c1,x1,a3);
		a4 = _mm256_fmadd_ps(c1,x1,a4); a5 = _mm256_fmadd_ps(c1,x1,a5);
		a6 = _mm256_fmadd_ps(c1,x1,a6); a7 = _mm256_fmadd_ps(c1,x1,a7);
		a8 = _mm256_fmadd_ps(c1,x1,a8); a9 = _mm256_fmadd_ps(c1,x1,a9);

		a0 = _mm256_fmadd_ps(c0,x1,a0); a1 = _mm256_fmadd_ps(c0,x1,a1);
		a2 = _mm256_fmadd_ps(c0,x1,a2); a3 = _mm256_fmadd_ps(c0,x1,a3);
		a4 = _mm256_fmadd_ps(c0,x1,a4); a5 = _mm256_fmadd_ps(c0,x1,a5);
		a6 = _mm256_fmadd_ps(c0,x1,a6); a7 = _mm256_fmadd_ps(c0,x1,a7);
		a8 = _mm256_fmadd_ps(c0,x1,a8); a9 = _mm256_fmadd_ps(c0,x1,a9);

		a0 = _mm256_fmadd_ps(c1,x1,a0); a1 = _mm256_fmadd_ps(c1,x1,a1);
		a2 = _mm256_fmadd_ps(c1,x1,a2); a3 = _mm256_fmadd_ps(c1,x1,a3);
		a4 = _mm256_fmadd_ps(c1,x1,a4); a5 = _mm256_fmadd_ps(c1,x1,a5);
		a6 = _mm256_fmadd_ps(c1,x1,a6); a7 = _mm256_fmadd_ps(c1,x1,a7);
		a8 = _mm256_fmadd_ps(c1,x1,a8); a9 = _mm256_fmadd_ps(c1,x1,a9);
	}
	gettimeofday(&end, nullptr);

	const long long flop = LENGTH * CYCLES * ACCUMULATORS * WIDTH * 2LL; // 2 flops per op
	const long long dur = usec(end,start);
	std::cout  	<< "ops 10x10 " << flop << " flops, usec " << dur << ", mflops " << (flop/dur)
				<< ", MB/s " << (static_cast<long long>(bytes)*1000000LL) / (dur*1024*1024) << std::endl;

	// sum up the accumulators and output them so that the optimizer doesnt get greedy;
	// the lanes are copied out with an unaligned store instead of reading a union member
	// that was not the one last written
	const __m256 total = a0+a1+a2+a3+a4+a5+a6+a7+a8+a9;
	float lanes[8];
	_mm256_storeu_ps(lanes, total);

	std::cout << "results:" << lanes[0]+lanes[1]+lanes[2]+lanes[3]+lanes[4]+lanes[5]+lanes[6]+lanes[7] << std::endl;

	// release the mapping obtained above
	munmap(vec, bytes);

	return 0;
}