CUDA bandwidth test

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <stdint.h>

#include "cuda_runtime.h"
#include "curand_kernel.h"
#include "device_launch_parameters.h"

#define GRID_SIZE  8192
#define BLOCK_SIZE 256
#define MEGABYTE (1024 * 1024)
#define THREADS_PER_HASH 8
#define ITERATIONS 16
#define FNV_PRIME   0x01000193

#define fnv(x,y) ((x) * FNV_PRIME ^(y))
#define random() (rand() * rand())

#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
                 __FUNCTION__, __LINE__, cudaGetErrorString(err) );   \
        exit(EXIT_FAILURE);                                           \
        }                                                             \
} while (0)

typedef union
{
    uint32_t uint32s[128 / sizeof(uint32_t)];
    uint4    uint4s[128 / sizeof(uint4)];
} hash128_t;

__constant__ hash128_t * d_buffer;
__constant__ unsigned int d_buffer_size;

__device__ uint32_t fnv_reduce(uint4 v)
{
    return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
}

__global__ void test(int search, volatile unsigned int * num_results)
{
    const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;

    const int thread_id = threadIdx.x &  (THREADS_PER_HASH - 1);
    const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);

    unsigned int s = gid;
    unsigned int r;

    for (int i = 0; i < THREADS_PER_HASH; i++) {

        for (int j = 0; j < ITERATIONS; j++) {
            unsigned int index = __shfl(s, start_lane + i)  % d_buffer_size;
            uint4 v = d_buffer[index].uint4s[thread_id];
            s = fnv_reduce(v);
        }

        s = __shfl(s, start_lane + i);
        if (i == thread_id) {
            r = s;
        }
    }

    if (search == r) {
        atomicInc(const_cast<unsigned int *>(num_results), UINT_MAX);
        __threadfence_system();
    }
}

int main(int argc, char *argv[])
{
    unsigned int buffer_size;

    if (argc == 2)
        buffer_size = atoi(argv[1]) * MEGABYTE;
    else
        buffer_size = 1024 * MEGABYTE;

    unsigned int * buffer = (unsigned int *)malloc(buffer_size);

    printf("Creating buffer of size %u bytes...\n", buffer_size);
    srand(time(NULL));

    for (unsigned int i = 0; i < buffer_size / 4; i++) {
        buffer[i] = random();
    }

    hash128_t * h_buffer;
    volatile unsigned int * num_results;
    unsigned int h_buffer_size = buffer_size / sizeof(hash128_t);


    CUDA_SAFE_CALL(cudaSetDevice(0));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&num_results, sizeof(unsigned int)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&h_buffer, buffer_size));
    CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(h_buffer), buffer, buffer_size, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_buffer, &h_buffer, sizeof(hash128_t *)));
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_buffer_size, &h_buffer_size, sizeof(unsigned int)));

    num_results[0] = 0;

    unsigned int target;
    target = random();

    cudaEvent_t start, stop;
    CUDA_SAFE_CALL(cudaEventCreate(&start));
    CUDA_SAFE_CALL(cudaEventCreate(&stop));
    cudaEventRecord(start, nullptr);
    int count = 16;
    for (int i = 0; i < count; i++) {
        test << <GRID_SIZE, BLOCK_SIZE >> >(target, num_results);
        CUDA_SAFE_CALL(cudaGetLastError());
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, nullptr);
    cudaFree(h_buffer);

    float duration;
    cudaEventElapsedTime(&duration, start, stop);
    printf("%f GB/s\n", (1000.0f / duration) * count * sizeof(uint4) * THREADS_PER_HASH * ITERATIONS * GRID_SIZE * BLOCK_SIZE / static_cast<float>(1 << 30));

    return num_results[0];
}