/*
    Fast GPU based reduction sum.
    Written by Jimmy Pettersson
    jimmy.pettersson@hpcsweden.se

*/

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>

#ifndef _WIN32
#include <unistd.h>
#include <sys/time.h>
#endif

#ifdef _WIN32
#include <windows.h>
#include <winbase.h>
#endif

#if __DEVICE_EMULATION__
#define DEBUG_SYNC __syncthreads();
#else
#define DEBUG_SYNC
#endif

#if (__CUDA_ARCH__ < 200)
#define int_mult(x,y)   __mul24((x),(y))
#else
#define int_mult(x,y)   ((x)*(y))
#endif

#if (__CUDA_ARCH__ >= 300)
#define REDUCE_REGISTERS 1 // Use __shfl instruction
#else
#define REDUCE_REGISTERS 0 // use shared memory
#endif

const int blockSize1 = 16384; // 32768

const int x_threads = 64;
const int NbXGroups = 4;

const int NbXWarps = x_threads/32;

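// Launch geometry: each thread block is x_threads * NbXGroups = 256 threads
// (NbXGroups rows of x_threads threads) and reduces blockSize1 = 16384 input
// elements, i.e. every thread accumulates 64 elements serially before any
// intra-block reduction takes place.
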
float m_bandwidth = -1.0;

#ifdef _WIN32
double get_clock()
{
    LARGE_INTEGER ticksPerSecond;
    LARGE_INTEGER timeStamp;

    QueryPerformanceFrequency(&ticksPerSecond);
    QueryPerformanceCounter(&timeStamp);

    double sec = double(timeStamp.QuadPart)/(double)ticksPerSecond.QuadPart;

    return sec; // returns timestamp in seconds
}
#else
double get_clock()
{
    // returns timestamp in seconds
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec*1.0e-6;
}
#endif

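// warp_reduce_registers: butterfly-style warp reduction using __shfl.
// Each of the five steps adds in the value held by the lane 16, 8, 4, 2 and
// 1 positions away (wrapping modulo 32), so after the last step every lane
// of the warp holds the sum of all 32 lane values.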
template<int x_dim, int y_dim>
__device__ float warp_reduce_registers(float myVal)
{
    int warpIndex = threadIdx.x%32;

    myVal += __shfl(myVal, warpIndex + 16);
    myVal += __shfl(myVal, warpIndex + 8);
    myVal += __shfl(myVal, warpIndex + 4);
    myVal += __shfl(myVal, warpIndex + 2);
    myVal += __shfl(myVal, warpIndex + 1);

    return myVal;
}
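
// A minimal sketch of the same warp sum using __shfl_down_sync, assuming a
// full 32-thread warp and a CUDA 9+ toolkit (where __shfl without a lane
// mask is no longer available). Only lane 0 ends up with the complete sum,
// which is all the caller needs when it broadcasts via shared memory.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
__device__ float warp_reduce_registers_sync(float myVal)
{
    // Fold the upper half of the warp onto the lower half, halving the
    // stride each step: 16, 8, 4, 2, 1.
    for (int offset = 16; offset > 0; offset >>= 1)
        myVal += __shfl_down_sync(0xffffffff, myVal, offset);
    return myVal;
}
#endif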
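// warp_reduce: classic warp-synchronous shared-memory reduction. The smem
// row must be declared volatile and have at least 64 columns, since the
// first step reads threadIdx.x + 32; only threads with threadIdx.x < 32 are
// expected to call it, and the row sum ends up in smem[threadIdx.y][0].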
template<int x_dim, int y_dim>
__device__ void warp_reduce(volatile float smem[y_dim][x_dim])
{
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 32];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 16];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 8];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 4];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 2];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 1];
}

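// reduce_dynamic: launched with a single block after the main kernel. It
// serially accumulates the tail elements that did not fit into a whole
// blockSize1-sized chunk (starting at start_adr) together with the per-block
// partial sums the main kernel left in out[], then performs one last
// shared-memory warp reduction so that out[0] holds the final result.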
template<int threads>
__global__ void reduce_dynamic(float* in, float* out, int n, int start_adr, int num_blocks)
{
    // padded to at least 64 columns so the "+ 32" step of warp_reduce stays
    // in bounds even for a 32-thread instantiation
    const int smem_cols = threads < 64 ? 64 : threads;
    __shared__ float smem[1][smem_cols];

    int tid = threadIdx.x + start_adr;

    float sum = 0.0f;

    // tail part
    int mult = 0;
    for(int i = 1; mult + tid < n; i++)
    {
        sum += in[tid + mult];
        mult = int_mult(i,threads);
    }

    // previously reduced part
    mult = 0;
    for(int i = 1; mult+threadIdx.x < num_blocks; i++)
    {
        sum += out[threadIdx.x + mult];
        mult = int_mult(i,threads);
    }

    if(threads == 32) smem[0][threadIdx.x+32] = 0.0f;

    smem[0][threadIdx.x] = sum;

    __syncthreads();

    if(threadIdx.x < 32)
    {
        warp_reduce<smem_cols, 1>(smem);
    }
    if(threadIdx.x == 0)
        out[blockIdx.x] = smem[0][threadIdx.x]; // out[0] == ans

}

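// reduce: main kernel. Each block reduces one contiguous chunk of blockSize1
// elements. Every thread first accumulates `iters` strided loads into a
// register; each row of x_threads threads then collapses its values with
// either the __shfl butterfly (REDUCE_REGISTERS) or the shared-memory
// warp_reduce, and the threadIdx.x == 0 threads add up the per-row results
// and write the block's partial sum to out[blockIdx.x].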
template<int els_per_x_group>
__global__ void reduce(float* in, float* out)
{

#if REDUCE_REGISTERS
    __shared__ float smem[NbXGroups][NbXWarps];
#else
    __shared__ float smem[NbXGroups][NbXWarps*32];
#endif

    int tid = threadIdx.x + threadIdx.y * els_per_x_group + blockIdx.x *(NbXGroups*els_per_x_group);

    float sum = 0.0f;

    const int iters = els_per_x_group/(x_threads);

#pragma unroll
    for(int i = 0; i < iters; i++)
    {
        sum += in[tid + i*x_threads];
    }

#if REDUCE_REGISTERS==0
    smem[threadIdx.y][threadIdx.x] = sum;

    __syncthreads();

    if(threadIdx.x < 32) warp_reduce<64, NbXGroups>(smem);

    __syncthreads();

#else

    float warpSum = warp_reduce_registers<x_threads, NbXGroups>( sum );

    // Broadcast warp-reduced sums via shared memory
    if( threadIdx.x%32 == 0 )
        smem[threadIdx.y][threadIdx.x/32] = warpSum;

    __syncthreads();

#endif

    // Final reduce
    if(threadIdx.x == 0)
    {
        sum = 0.0f;
#pragma unroll
        for(int i = 0; i < NbXGroups; i++)
        {
#if REDUCE_REGISTERS
            for(int j = 0; j < NbXWarps; j++)
                sum += smem[i][j];
#else
            sum += smem[i][0];
#endif
        }

        out[blockIdx.x] = sum; // partial sum for this block
    }

}

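// Reference sum on the host. Note that a single float accumulator loses
// integer precision once the running sum passes 2^24, so for the largest
// test sizes (inputs are 0/1 values) this reference itself drifts; the
// relative-error check below divides by num_els, which keeps that drift
// within the pass threshold for these sizes.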
float cpu_sum(float* in, int num_els)
{
    float sum = 0.0f;

    for(int i = 0; i < num_els; i++)
    {
        sum += in[i];
    }

    return sum;
}

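// compute_reduction: two-phase dispatch. First the main reduce kernel is
// launched over all whole blockSize1-sized chunks, leaving one partial sum
// per block in d_out; then a single-block reduce_dynamic launch folds the
// leftover tail elements and those partial sums into d_out[0].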
void compute_reduction(float* d_in, float* d_out, int num_els)
{

    int block_size = blockSize1;
    int num_blocks = num_els/block_size;

    int tail = num_els - num_blocks*block_size;
    int start_adr = num_els - tail;

    const int els_per_x_group = blockSize1 / NbXGroups;

    // block configuration
    dim3 blockDims(x_threads, NbXGroups);

    reduce<els_per_x_group><<< num_blocks, blockDims>>>(d_in, d_out);

    reduce_dynamic<x_threads><<< 1, x_threads>>>(d_in, d_out, num_els, start_adr, num_blocks);

}


float rel_diff(float ref, float val)
{
    float rel_diff = fabsf(ref-val) / fabsf(ref);
    return rel_diff;
}
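
// my_reduction_test: benchmark and correctness driver for one problem size.
// It fills the input with random 0/1 values, times testIterations calls of
// compute_reduction, and prints the effective bandwidth (bytes read per
// second), the fraction of the device's peak-bandwidth estimate, the time
// per reduction in microseconds, and a pass/fail check of out[0] against
// cpu_sum.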
double my_reduction_test(int num_els)
{

    // timers (get_clock() returns seconds on both platforms)
    double start;
    double delta;

    int testIterations = 100;

    int size = num_els*sizeof(float);

    float* d_in;
    float* d_out;

    float* d_warm1;
    float* d_warm2;

    float* in = (float*)malloc(size);
    float* out = (float*)malloc(size);

    for(int i = 0; i < num_els; i++)
    {
        in[i] = rand()&1;
    }

    cudaMalloc((void**)&d_in, size);
    cudaMalloc((void**)&d_out, size);

    cudaMalloc((void**)&d_warm1, 1024*sizeof(float));
    cudaMalloc((void**)&d_warm2, 1024*sizeof(float));

    cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);

    //time it
    start = get_clock();

    //////////////
    // real reduce
    /////////////

    for(int i = 0; i < testIterations; i++)
        compute_reduction(d_in, d_out, num_els);

    cudaDeviceSynchronize();
    delta = get_clock() - start;

    float dt = float(delta)/float(testIterations);

    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);

    cudaError_t error = cudaGetLastError();

    if(error != cudaSuccess) printf("\n Error  : %s \n", cudaGetErrorString(error));

    float throughput = num_els*sizeof(float)*float(1E-9)/(dt);
    int tail = num_els - (num_els/blockSize1)*blockSize1;
    float avg_rel_diff = rel_diff( cpu_sum(in,num_els), out[0])/(float(num_els));
    printf(" %7.0d \t %0.2f \t\t %0.2f %% \t %0.1f \t\t  %s \n", num_els, throughput,
        (throughput/m_bandwidth)*100.0f, dt*float(1E6),  avg_rel_diff<float(1E-7)  ? "Pass" : "Fail");

    cudaFree(d_in);
    cudaFree(d_out);

    cudaFree(d_warm1);
    cudaFree(d_warm2);

    free(in);
    free(out);

    return delta;

}

void pause()
{

#ifdef _WIN32
    system("pause");
#else
    getchar();
#endif

}
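
// main: queries the selected device and estimates its theoretical peak DRAM
// bandwidth as memoryClockRate (kHz) * 1e3 * 2 (DDR) * memoryBusWidth / 8
// bytes per second, then sweeps power-of-two sizes from 1M to 128M elements
// followed by a few non-power-of-two sizes; each row reports achieved GB/s,
// the percentage of the peak estimate, microseconds per reduction, and a
// pass/fail correctness check.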
int main(int argc, char* argv[])
{

    int device = 0;
    cudaSetDevice(device);

    cudaDeviceProp pr;
    cudaGetDeviceProperties(&pr, device);

    m_bandwidth = ( (float)pr.memoryClockRate*2.0f*float(1E3) * (float)(pr.memoryBusWidth/8) )*float(1E-9);

    printf(" %s @ %0.3f GB/s \n", pr.name, m_bandwidth );

    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");

    for(int i = 1024*1024; i <= 128*1024*1024; i=i*2)
    {
        my_reduction_test(i);
    }

    printf("\n Non-base 2 tests! \n");
    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");

    // just some large numbers....
    my_reduction_test(14*1024*1024+38);
    my_reduction_test(14*1024*1024+55);
    my_reduction_test(18*1024*1024+1232);
    my_reduction_test(7*1024*1024+94854);

    for(int i = 0; i < 4; i++)
    {
        float ratio = float(rand())/float(RAND_MAX);
        ratio = ratio >= 0 ? ratio : -ratio;
        int big_num = ratio*18*1e6;

        my_reduction_test(big_num);
    }

    pause();

    return 0;
}
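
// A typical build line, assuming the file is saved as reduction.cu and a
// Kepler-class target (any sm_30+ arch enables the __shfl path), might be:
//
//   nvcc -O3 -arch=sm_35 reduction.cu -o reduction
//
// On newer toolkits, where the non-sync __shfl intrinsics are deprecated or
// removed, the __shfl_down_sync sketch above (or the shared-memory path) is
// the safer choice.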