Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- Fast GPU based reduction sum.
- Written by Jimmy Pettersson
- jimmy.pettersson@hpcsweden.se
- */
- #include <stdio.h>
- #include <stdlib.h>
- #include <cuda.h>
- #include <math.h>
- #ifndef _WIN32
- #include <unistd.h>
- #include <sys/time.h>
- #endif
- #ifdef _WIN32
- #include <windows.h>
- #include <winbase.h>
- #endif
- #if __DEVICE_EMULATION__
- #define DEBUG_SYNC __syncthreads();
- #else
- #define DEBUG_SYNC
- #endif
- #if (__CUDA_ARCH__ < 200)
- #define int_mult(x,y) __mul24(x,y)
- #else
- #define int_mult(x,y) x*y
- #endif
- #if (__CUDA_ARCH__ >= 300)
- #define REDUCE_REGISTERS 1 // Use __shfl instruction
- #else
- #define REDUCE_REGISTERS 0 // use shared memory
- #endif
- const int blockSize1 = 16384; // 32768
- const int x_threads = 64;
- const int NbXGroups = 4;
- const int NbXWarps = x_threads/32;
- float m_bandwidth = -1.0;
- #ifdef _WIN32
- double get_clock()
- {
- LARGE_INTEGER ticksPerSecond;
- LARGE_INTEGER timeStamp;
- QueryPerformanceFrequency(&ticksPerSecond);
- QueryPerformanceCounter(&timeStamp);
- double sec = double(timeStamp.QuadPart)/(double)ticksPerSecond.QuadPart;
- return sec; // returns timestamp in secounds
- }
- #else
- double get_clock()
- {
- unsigned long long int get_clock()
- {
- struct timeval tv;
- gettimeofday(&tv, NULL);
- return (unsigned long long int)tv.tv_usec + 1000000*tv.tv_sec;
- }
- }
- #endif
- template<int x_dim, int y_dim>
- __device__ float warp_reduce_registers(float myVal)
- {
- int warpIndex = threadIdx.x%32;
- myVal += __shfl(myVal, warpIndex + 16);
- myVal += __shfl(myVal, warpIndex + 8);
- myVal += __shfl(myVal, warpIndex + 4);
- myVal += __shfl(myVal, warpIndex + 2);
- myVal += __shfl(myVal, warpIndex + 1);
- return myVal;
- }
- template<int x_dim, int y_dim>
- __device__ void warp_reduce(volatile float smem[y_dim][x_dim])
- {
- smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 32];
- smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 16];
- smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 8];
- smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 4];
- smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 2];
- smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 1];
- }
- template<int threads>
- __global__ void reduce_dynamic(float* in, float* out, int n, int start_adr, int num_blocks)
- {
- __shared__ float smem[1][threads];
- int tid = threadIdx.x + start_adr;
- float sum = 0.0f;
- // tail part
- int mult = 0;
- for(int i = 1; mult + tid < n; i++)
- {
- sum += in[tid + mult];
- mult = int_mult(i,threads);
- }
- // previously reduced part
- mult = 0;
- for(int i = 1; mult+threadIdx.x < num_blocks; i++)
- {
- sum += out[threadIdx.x + mult];
- mult = int_mult(i,threads);
- }
- if(threads == 32) smem[0][threadIdx.x+32] = 0.0f;
- smem[0][threadIdx.x] = sum;
- __syncthreads();
- if(threadIdx.x < 32)
- {
- warp_reduce<threads, 1>(smem);
- }
- if(threadIdx.x == 0)
- out[blockIdx.x] = smem[0][threadIdx.x]; // out[0] == ans
- }
- template<int els_per_x_group>
- __global__ void reduce(float* in, float* out)
- {
- #if REDUCE_REGISTERS
- __shared__ float smem[NbXGroups][NbXWarps];
- #else
- __shared__ float smem[NbXGroups][NbXWarps*32];
- #endif
- int tid = threadIdx.x + threadIdx.y * els_per_x_group + blockIdx.x *(NbXGroups*els_per_x_group);
- float sum = 0.0f;
- const int iters = els_per_x_group/(x_threads);
- #pragma unroll
- for(int i = 0; i < iters; i++)
- {
- sum += in[tid + i*x_threads];
- }
- #if REDUCE_REGISTERS==0
- smem[threadIdx.y][threadIdx.x] = sum;
- __syncthreads();
- if(threadIdx.x < 32) warp_reduce<64, NbXGroups>(smem);
- __syncthreads();
- #else
- float warpSum = warp_reduce_registers<x_threads, NbXGroups>( sum ) ;
- // Broadcast warp redudced sums via shared memory
- if( threadIdx.x%32 == 0 )
- smem[threadIdx.y][threadIdx.x/32] = warpSum;
- __syncthreads();
- #endif
- // Final reduce
- if(threadIdx.x == 0)
- {
- sum = 0.0f;
- #pragma unroll
- for(int i = 0; i < NbXGroups; i++)
- {
- #if REDUCE_REGISTERS
- for(int j = 0; j < NbXWarps; j++)
- sum += smem[i][j];
- #else
- sum += smem[i][0];
- #endif
- }
- out[blockIdx.x] = sum; // out[0] == ans
- }
- }
- float cpu_sum(float* in, int num_els)
- {
- float sum = 0.0f;
- for(int i = 0; i < num_els; i++)
- {
- sum += in[i];
- }
- return sum;
- }
- void compute_reduction(float* d_in, float* d_out, int num_els)
- {
- int block_size = blockSize1;
- int num_blocks = num_els/block_size;
- int tail = num_els - num_blocks*block_size;
- int start_adr = num_els - tail;
- const int els_per_x_group = blockSize1 / NbXGroups;
- // block configuration
- dim3 blockDims(x_threads, NbXGroups);
- reduce<els_per_x_group><<< num_blocks, blockDims>>>(d_in, d_out);
- reduce_dynamic<x_threads><<< 1, x_threads>>>(d_in, d_out, num_els, start_adr, num_blocks);
- }
- float rel_diff(float ref, float val)
- {
- float rel_diff = abs(ref-val) / abs(ref);
- return rel_diff;
- }
- unsigned long long int my_reduction_test(int num_els)
- {
- // timers
- #ifndef _WIN32
- unsigned long long int start;
- unsigned long long int delta;
- #else
- double start;
- double delta;
- #endif
- int testIterations = 100;
- int size = num_els*sizeof(float);
- float* d_in;
- float* d_out;
- float* d_warm1;
- float* d_warm2;
- float* in = (float*)malloc(size);
- float* out = (float*)malloc(size);
- for(int i = 0; i < num_els; i++)
- {
- in[i] = rand()&1;
- }
- cudaMalloc((void**)&d_in, size);
- cudaMalloc((void**)&d_out, size);
- cudaMalloc((void**)&d_warm1, 1024*sizeof(float));
- cudaMalloc((void**)&d_warm2, 1024*sizeof(float));
- cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
- //time it
- start = get_clock();
- //////////////
- // real reduce
- /////////////
- for(int i = 0; i < testIterations; i++)
- compute_reduction(d_in, d_out, num_els);
- cudaThreadSynchronize();
- delta = get_clock() - start;
- float dt = float(delta)/float(testIterations);
- cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);
- cudaError_t error = cudaGetLastError();
- if(error != cudaSuccess) printf("\n Error : %s \n", cudaGetErrorString(error));
- float throughput = num_els*sizeof(float)*float(1E-9)/(dt);
- int tail = num_els - (num_els/blockSize1)*blockSize1;
- float avg_rel_diff = rel_diff( cpu_sum(in,num_els), out[0])/(float(num_els));
- printf(" %7.0d \t %0.2f \t\t %0.2f % \t %0.1f \t\t %s \n", num_els, throughput,
- (throughput/m_bandwidth)*100.0f, dt*float(1E6), avg_rel_diff<float(1E-7) ? "Pass" : "Fail");
- cudaFree(d_in);
- cudaFree(d_out);
- cudaFree(d_warm1);
- cudaFree(d_warm2);
- free(in);
- free(out);
- return delta;
- }
- void pause()
- {
- #ifdef _WIN32
- system("pause");
- #else
- fgetchar();
- #endif
- }
- int main(int argc, char* argv[])
- {
- int device = 0;
- cudaSetDevice(device);
- cudaDeviceProp pr;
- cudaGetDeviceProperties(&pr, device);
- m_bandwidth = ( (float)pr.memoryClockRate*2.0f*float(1E3) * (float)(pr.memoryBusWidth/8) )*float(1E-9);
- printf(" %s @ %0.3f GB/s \n", pr.name, m_bandwidth );
- printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");
- #pragma unroll
- for(int i = 1024*1024; i <= 128*1024*1024; i=i*2)
- {
- my_reduction_test(i);
- }
- printf("\n Non-base 2 tests! \n");
- printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");
- // just some large numbers....
- my_reduction_test(14*1024*1024+38);
- my_reduction_test(14*1024*1024+55);
- my_reduction_test(18*1024*1024+1232);
- my_reduction_test(7*1024*1024+94854);
- for(int i = 0; i < 4; i++)
- {
- float ratio = float(rand())/float(RAND_MAX);
- ratio = ratio >= 0 ? ratio : -ratio;
- int big_num = ratio*18*1e6;
- my_reduction_test(big_num);
- }
- pause();
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement