Untitled

/*
    Fast GPU based reduction sum.
    Written by Jimmy Pettersson
    jimmy.pettersson@hpcsweden.se

*/

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>


#ifndef _WIN32
#include <unistd.h>
#include <sys/time.h>
#endif

#ifdef _WIN32
#include <windows.h>
#include <winbase.h>
#endif

#if __DEVICE_EMULATION__
#define DEBUG_SYNC __syncthreads();
#else
#define DEBUG_SYNC
#endif

#if (__CUDA_ARCH__ < 200)
#define int_mult(x,y)   __mul24(x,y)
#else
#define int_mult(x,y)   x*y
#endif


#if (__CUDA_ARCH__ >= 300)
#define REDUCE_REGISTERS 1 // Use __shfl instruction
#else
#define REDUCE_REGISTERS 0 // use shared memory
#endif


const int blockSize1 = 16384; // 32768


const int x_threads = 64;
const int NbXGroups = 4;

const int NbXWarps = x_threads/32;

float m_bandwidth = -1.0;

#ifdef _WIN32
double get_clock()
{

    LARGE_INTEGER ticksPerSecond;
    LARGE_INTEGER timeStamp;

    QueryPerformanceFrequency(&ticksPerSecond);
    QueryPerformanceCounter(&timeStamp);

    double sec = double(timeStamp.QuadPart)/(double)ticksPerSecond.QuadPart;

    return sec; // returns timestamp in secounds

}
#else
double get_clock()
{

unsigned long long int get_clock()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (unsigned long long int)tv.tv_usec + 1000000*tv.tv_sec;

}

}
#endif


template<int x_dim, int y_dim>
__device__ float warp_reduce_registers(float myVal)
{

    int warpIndex = threadIdx.x%32;

    myVal += __shfl(myVal, warpIndex + 16);
    myVal += __shfl(myVal, warpIndex + 8);
    myVal += __shfl(myVal, warpIndex + 4);
    myVal += __shfl(myVal, warpIndex + 2);
    myVal += __shfl(myVal, warpIndex + 1);

    return myVal;
}


template<int x_dim, int y_dim>
__device__ void warp_reduce(volatile float smem[y_dim][x_dim])
{

        smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 32];
        smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 16];
        smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 8];
        smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 4];
        smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 2];
        smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 1];


}

template<int threads>
__global__ void reduce_dynamic(float* in, float* out, int n, int start_adr, int num_blocks)
{

    __shared__ float smem[1][threads];

    int tid = threadIdx.x + start_adr;

    float sum = 0.0f;

    // tail part
    int mult = 0;
    for(int i = 1; mult + tid < n; i++)
    {

        sum += in[tid + mult];
        mult = int_mult(i,threads);
    }

    // previously reduced part
    mult = 0;
    for(int i = 1; mult+threadIdx.x < num_blocks; i++)
    {
        sum += out[threadIdx.x + mult];
        mult = int_mult(i,threads);
    }


    if(threads == 32) smem[0][threadIdx.x+32] = 0.0f;

    smem[0][threadIdx.x] = sum;

    __syncthreads();

    if(threadIdx.x < 32)
    {
        warp_reduce<threads, 1>(smem);
    }
    if(threadIdx.x == 0)
        out[blockIdx.x] = smem[0][threadIdx.x]; // out[0] == ans


}

template<int els_per_x_group>
__global__ void reduce(float* in, float* out)
{


#if REDUCE_REGISTERS
    __shared__ float smem[NbXGroups][NbXWarps];
#else
    __shared__ float smem[NbXGroups][NbXWarps*32];
#endif

    int tid = threadIdx.x + threadIdx.y * els_per_x_group + blockIdx.x *(NbXGroups*els_per_x_group);

    float sum = 0.0f;

    const int iters = els_per_x_group/(x_threads);


#pragma unroll
        for(int i = 0; i < iters; i++)
        {
            sum += in[tid + i*x_threads];
        }


#if REDUCE_REGISTERS==0
    smem[threadIdx.y][threadIdx.x] = sum;

    __syncthreads();

    if(threadIdx.x < 32) warp_reduce<64, NbXGroups>(smem);

    __syncthreads();

#else

    float warpSum = warp_reduce_registers<x_threads, NbXGroups>( sum ) ;

    // Broadcast warp redudced sums via shared memory

    if( threadIdx.x%32 == 0 )
    smem[threadIdx.y][threadIdx.x/32] = warpSum;

    __syncthreads();

#endif


    // Final reduce
    if(threadIdx.x == 0)
    {
        sum = 0.0f;
#pragma unroll
        for(int i = 0; i < NbXGroups; i++)
        {
#if REDUCE_REGISTERS
            for(int j = 0; j < NbXWarps; j++)
                sum += smem[i][j];
#else
                sum += smem[i][0];
#endif

        }

        out[blockIdx.x] = sum; // out[0] == ans
    }


}

float cpu_sum(float* in, int num_els)
{
    float sum = 0.0f;

    for(int i = 0; i < num_els; i++)
    {
        sum += in[i];
    }

    return sum;
}


void compute_reduction(float* d_in, float* d_out, int num_els)
{


    int block_size = blockSize1;
    int num_blocks = num_els/block_size;

    int tail = num_els - num_blocks*block_size;
    int start_adr = num_els - tail;


    const int els_per_x_group = blockSize1 / NbXGroups;

    // block configuration
    dim3 blockDims(x_threads, NbXGroups);

    reduce<els_per_x_group><<< num_blocks, blockDims>>>(d_in, d_out);


    reduce_dynamic<x_threads><<< 1, x_threads>>>(d_in, d_out, num_els, start_adr, num_blocks);

}


float rel_diff(float ref, float val)
{
    float rel_diff = abs(ref-val) / abs(ref);
    return rel_diff;

}
unsigned long long int my_reduction_test(int num_els)
{


    // timers

#ifndef _WIN32
    unsigned long long int start;
    unsigned long long int delta;
#else
    double start;
    double delta;
#endif

    int testIterations = 100;

    int size = num_els*sizeof(float);

    float* d_in;
    float* d_out;

    float* d_warm1;
    float* d_warm2;


    float* in = (float*)malloc(size);
    float* out = (float*)malloc(size);


    for(int i = 0; i < num_els; i++)
    {
            in[i] = rand()&1;
    }

    cudaMalloc((void**)&d_in, size);
    cudaMalloc((void**)&d_out, size);

    cudaMalloc((void**)&d_warm1, 1024*sizeof(float));
    cudaMalloc((void**)&d_warm2, 1024*sizeof(float));

    cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);


    //time it
    start = get_clock();

    //////////////
    // real reduce
    /////////////

    for(int i = 0; i < testIterations; i++)
        compute_reduction(d_in, d_out, num_els);


    cudaThreadSynchronize();
    delta = get_clock() - start;

    float dt = float(delta)/float(testIterations);

    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);

    cudaError_t error = cudaGetLastError();

    if(error != cudaSuccess) printf("\n Error  : %s \n", cudaGetErrorString(error));

    float throughput = num_els*sizeof(float)*float(1E-9)/(dt);
    int tail = num_els - (num_els/blockSize1)*blockSize1;
    float avg_rel_diff = rel_diff( cpu_sum(in,num_els), out[0])/(float(num_els));
    printf(" %7.0d \t %0.2f \t\t %0.2f % \t %0.1f \t\t  %s \n", num_els, throughput,
        (throughput/m_bandwidth)*100.0f, dt*float(1E6),  avg_rel_diff<float(1E-7)  ? "Pass" : "Fail");


    cudaFree(d_in);
    cudaFree(d_out);

    cudaFree(d_warm1);
    cudaFree(d_warm2);

    free(in);
    free(out);


    return delta;

}

void pause()
{

#ifdef _WIN32
    system("pause");
#else
fgetchar();
#endif


}
int main(int argc, char* argv[])
{

    int device = 0;
    cudaSetDevice(device);

    cudaDeviceProp pr;
    cudaGetDeviceProperties(&pr, device);

    m_bandwidth = ( (float)pr.memoryClockRate*2.0f*float(1E3) * (float)(pr.memoryBusWidth/8) )*float(1E-9);


printf(" %s @ %0.3f GB/s \n", pr.name, m_bandwidth );

printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");

#pragma unroll
for(int i = 1024*1024; i <= 128*1024*1024; i=i*2)
{
    my_reduction_test(i);
}

printf("\n Non-base 2 tests! \n");
printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");

// just some large numbers....
my_reduction_test(14*1024*1024+38);
my_reduction_test(14*1024*1024+55);
my_reduction_test(18*1024*1024+1232);
my_reduction_test(7*1024*1024+94854);


for(int i = 0; i < 4; i++)
{

    float ratio = float(rand())/float(RAND_MAX);
    ratio = ratio >= 0 ? ratio : -ratio;
    int big_num = ratio*18*1e6;

    my_reduction_test(big_num);
}


pause();


    return 0;
}