/*
    Fast GPU based reduction sum.
    Written by Jimmy Pettersson
*/
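
/*
    Build sketch (an assumption, not part of the original paste): if this file
    is saved as reduction.cu, it should compile with something like

        nvcc -arch=sm_30 -o reduction reduction.cu

    sm_30 enables the __shfl-based path guarded by __CUDA_ARCH__ >= 300 below;
    older targets fall back to the shared-memory warp reduction.
*/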

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>


#ifndef _WIN32
#include <unistd.h>
#include <sys/time.h>
#endif

#ifdef _WIN32
#include <windows.h>
#include <winbase.h>
#endif

#if __DEVICE_EMULATION__
#define DEBUG_SYNC __syncthreads();
#else
#define DEBUG_SYNC
#endif

#if (__CUDA_ARCH__ < 200)
#define int_mult(x,y)   __mul24((x),(y))
#else
#define int_mult(x,y)   ((x)*(y))
#endif


#if (__CUDA_ARCH__ >= 300)
#define REDUCE_REGISTERS 1 // Use the __shfl instruction
#else
#define REDUCE_REGISTERS 0 // Use shared memory
#endif


const int blockSize1 = 16384; // elements reduced per block by the main kernel (alternative: 32768)


const int x_threads = 64;  // threads per block in x (blockDim.x)
const int NbXGroups = 4;   // thread rows per block (blockDim.y)

const int NbXWarps = x_threads/32; // warps per thread row

float m_bandwidth = -1.0f; // theoretical peak DRAM bandwidth in GB/s, set in main()

#ifdef _WIN32
double get_clock()
{
    LARGE_INTEGER ticksPerSecond;
    LARGE_INTEGER timeStamp;

    QueryPerformanceFrequency(&ticksPerSecond);
    QueryPerformanceCounter(&timeStamp);

    double sec = double(timeStamp.QuadPart)/(double)ticksPerSecond.QuadPart;

    return sec; // returns timestamp in seconds
}
#else
double get_clock()
{
    // Return the timestamp in seconds, matching the Windows branch above.
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec*1E-6;
}
#endif


template<int x_dim, int y_dim>
__device__ float warp_reduce_registers(float myVal)
{
    // Rotate-and-add warp reduction using the shuffle intrinsic
    // (__shfl is the pre-CUDA 9 spelling; later toolkits use __shfl_sync).
    int warpIndex = threadIdx.x%32;

    myVal += __shfl(myVal, warpIndex + 16);
    myVal += __shfl(myVal, warpIndex + 8);
    myVal += __shfl(myVal, warpIndex + 4);
    myVal += __shfl(myVal, warpIndex + 2);
    myVal += __shfl(myVal, warpIndex + 1);

    return myVal;
}
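
/*
    How the rotate-and-add reduction above arrives at the full warp sum,
    illustrated with a hypothetical 4-lane warp and offsets 2 and 1: if the
    lanes start with v0, v1, v2, v3, the offset-2 step leaves lane i holding
    v_i + v_{(i+2)%4}, and the offset-1 step then leaves every lane holding
    v0 + v1 + v2 + v3. The 32-lane version uses offsets 16, 8, 4, 2, 1, so
    each lane (in particular lane 0, whose value is the one read later) ends
    up with the sum of the whole warp; __shfl wraps out-of-range source lanes
    modulo the warp size, which is what makes the rotation work.
*/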


template<int x_dim, int y_dim>
__device__ void warp_reduce(volatile float smem[y_dim][x_dim])
{
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 32];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 16];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 8];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 4];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 2];
    smem[threadIdx.y][threadIdx.x] += smem[threadIdx.y][threadIdx.x + 1];
}
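
/*
    Note on warp_reduce: both call sites only let threads with
    threadIdx.x < 32 enter, and the shared-memory parameter is volatile, so
    every intermediate result is actually written back to shared memory. On
    the pre-Volta GPUs this code targets, such a group executes as a single
    warp in lockstep, which is why no __syncthreads() is needed between the
    six accumulation steps.
*/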

template<int threads>
__global__ void reduce_dynamic(float* in, float* out, int n, int start_adr, int num_blocks)
{
    // Padded to at least 64 columns so the +32 accesses inside warp_reduce
    // stay in bounds even if the kernel is instantiated with threads == 32;
    // declared volatile to match warp_reduce's parameter type.
    const int smem_cols = (threads < 64) ? 64 : threads;
    __shared__ volatile float smem[1][smem_cols];

    int tid = threadIdx.x + start_adr;

    float sum = 0.0f;

    // tail part: elements in [start_adr, n) not covered by the main reduce kernel
    int mult = 0;
    for(int i = 1; mult + tid < n; i++)
    {
        sum += in[tid + mult];
        mult = int_mult(i,threads);
    }

    // previously reduced part: per-block partial sums written by the reduce kernel
    mult = 0;
    for(int i = 1; mult+threadIdx.x < num_blocks; i++)
    {
        sum += out[threadIdx.x + mult];
        mult = int_mult(i,threads);
    }

    if(threads == 32) smem[0][threadIdx.x+32] = 0.0f;

    smem[0][threadIdx.x] = sum;

    __syncthreads();

    if(threadIdx.x < 32)
    {
        warp_reduce<smem_cols, 1>(smem);
    }
    if(threadIdx.x == 0)
        out[blockIdx.x] = smem[0][threadIdx.x]; // out[0] == ans
}
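
/*
    reduce_dynamic is launched with a single block of x_threads threads (see
    compute_reduction below). It adds up the tail elements in [start_adr, n)
    that did not fit into a full block of the main reduce kernel, plus the
    num_blocks per-block partial sums that the main kernel left in out[], and
    writes the final result to out[0].
*/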

// Main reduction kernel: each block sums NbXGroups*els_per_x_group
// (= blockSize1) consecutive elements and writes one partial sum to
// out[blockIdx.x].
template<int els_per_x_group>
__global__ void reduce(float* in, float* out)
{

#if REDUCE_REGISTERS
    __shared__ float smem[NbXGroups][NbXWarps];
#else
    // Declared volatile to match warp_reduce's parameter type.
    __shared__ volatile float smem[NbXGroups][NbXWarps*32];
#endif

    int tid = threadIdx.x + threadIdx.y * els_per_x_group + blockIdx.x * (NbXGroups*els_per_x_group);

    float sum = 0.0f;

    const int iters = els_per_x_group/(x_threads);

#pragma unroll
    for(int i = 0; i < iters; i++)
    {
        sum += in[tid + i*x_threads];
    }

#if REDUCE_REGISTERS==0
    smem[threadIdx.y][threadIdx.x] = sum;

    __syncthreads();

    if(threadIdx.x < 32) warp_reduce<NbXWarps*32, NbXGroups>(smem);

    __syncthreads();

#else

    float warpSum = warp_reduce_registers<x_threads, NbXGroups>( sum );

    // Broadcast the warp-reduced sums via shared memory
    if( threadIdx.x%32 == 0 )
        smem[threadIdx.y][threadIdx.x/32] = warpSum;

    __syncthreads();

#endif

    // Final reduce: one thread per block accumulates the per-row results.
    if(threadIdx.x == 0 && threadIdx.y == 0)
    {
        sum = 0.0f;
#pragma unroll
        for(int i = 0; i < NbXGroups; i++)
        {
#if REDUCE_REGISTERS
            for(int j = 0; j < NbXWarps; j++)
                sum += smem[i][j];
#else
            sum += smem[i][0];
#endif
        }

        out[blockIdx.x] = sum; // partial sum for this block; combined later by reduce_dynamic
    }
}
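
/*
    Index layout of the reduce kernel with the constants used here
    (blockSize1 = 16384, NbXGroups = 4, x_threads = 64): each block covers
    16384 consecutive elements, split into 4 row-groups of
    els_per_x_group = 4096 elements, one per threadIdx.y; within a row-group
    the 64 x-threads stride through the data, so iters = 4096/64 = 64 loads
    per thread before the in-block reduction starts.
*/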

float cpu_sum(float* in, int num_els)
{
    // Accumulate in double so the CPU reference stays accurate for large
    // element counts (a float accumulator stops growing once the running
    // sum of ones reaches 2^24).
    double sum = 0.0;

    for(int i = 0; i < num_els; i++)
    {
        sum += in[i];
    }

    return (float)sum;
}


void compute_reduction(float* d_in, float* d_out, int num_els)
{
    int block_size = blockSize1;
    int num_blocks = num_els/block_size;

    int tail = num_els - num_blocks*block_size;
    int start_adr = num_els - tail;

    const int els_per_x_group = blockSize1 / NbXGroups;

    // block configuration
    dim3 blockDims(x_threads, NbXGroups);

    // Main pass: one partial sum per full block of blockSize1 elements,
    // written to d_out[0..num_blocks-1]. Skipped when the input is smaller
    // than one block, since a zero-block launch is invalid.
    if(num_blocks > 0)
        reduce<els_per_x_group><<< num_blocks, blockDims >>>(d_in, d_out);

    // Final pass: a single block folds the tail and the partial sums into d_out[0].
    reduce_dynamic<x_threads><<< 1, x_threads >>>(d_in, d_out, num_els, start_adr, num_blocks);
}
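
/*
    Host-side usage sketch (mirrors my_reduction_test below; d_in and d_out
    are device buffers allocated as in that function):

        compute_reduction(d_in, d_out, num_els);
        cudaDeviceSynchronize();
        float result;
        cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost); // final sum lives in d_out[0]
*/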


float rel_diff(float ref, float val)
{
    return fabsf(ref - val) / fabsf(ref);
}

double my_reduction_test(int num_els)
{
    // timers (get_clock() returns seconds on both platforms)
    double start;
    double delta;

    int testIterations = 100;

    size_t size = (size_t)num_els*sizeof(float);

    float* d_in;
    float* d_out;

    float* d_warm1;
    float* d_warm2;

    float* in = (float*)malloc(size);
    float* out = (float*)malloc(size);

    for(int i = 0; i < num_els; i++)
    {
        in[i] = rand()&1; // 0 or 1
    }

    cudaMalloc((void**)&d_in, size);
    cudaMalloc((void**)&d_out, size);

    // warm-up allocations
    cudaMalloc((void**)&d_warm1, 1024*sizeof(float));
    cudaMalloc((void**)&d_warm2, 1024*sizeof(float));

    cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);

    //time it
    start = get_clock();

    //////////////
    // real reduce
    /////////////

    for(int i = 0; i < testIterations; i++)
        compute_reduction(d_in, d_out, num_els);

    cudaDeviceSynchronize();
    delta = get_clock() - start;

    float dt = float(delta)/float(testIterations);

    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);

    cudaError_t error = cudaGetLastError();

    if(error != cudaSuccess) printf("\n Error  : %s \n", cudaGetErrorString(error));

    float throughput = num_els*sizeof(float)*float(1E-9)/(dt);
    float avg_rel_diff = rel_diff( cpu_sum(in,num_els), out[0] )/float(num_els);
    printf(" %7d \t %0.2f \t\t %0.2f %% \t %0.1f \t\t  %s \n", num_els, throughput,
        (throughput/m_bandwidth)*100.0f, dt*float(1E6), avg_rel_diff < float(1E-7) ? "Pass" : "Fail");

    cudaFree(d_in);
    cudaFree(d_out);

    cudaFree(d_warm1);
    cudaFree(d_warm2);

    free(in);
    free(out);

    return delta;
}
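
/*
    Columns printed by my_reduction_test: N is the element count, [GB/s] is
    the bytes read per pass (N * sizeof(float)) divided by the average time
    per pass, [perc] is that throughput as a percentage of the theoretical
    DRAM bandwidth computed in main(), [usec] is the average time per pass,
    and the test reports Pass when the relative error against the CPU sum,
    divided by N, stays below 1E-7.
*/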

void pause()
{
#ifdef _WIN32
    system("pause");
#else
    getchar(); // portable replacement for the non-standard fgetchar()
#endif
}

int main(int argc, char* argv[])
{
    int device = 0;
    cudaSetDevice(device);

    cudaDeviceProp pr;
    cudaGetDeviceProperties(&pr, device);

    // Theoretical peak DRAM bandwidth in GB/s: memoryClockRate is reported
    // in kHz, x2 for DDR, times the bus width in bytes.
    m_bandwidth = ( (float)pr.memoryClockRate*2.0f*float(1E3) * (float)(pr.memoryBusWidth/8) )*float(1E-9);

    printf(" %s @ %0.3f GB/s \n", pr.name, m_bandwidth);

    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");

    // Power-of-two sizes
    for(int i = 1024*1024; i <= 128*1024*1024; i = i*2)
    {
        my_reduction_test(i);
    }

    printf("\n Non-base 2 tests! \n");
    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");

    // just some large numbers....
    my_reduction_test(14*1024*1024+38);
    my_reduction_test(14*1024*1024+55);
    my_reduction_test(18*1024*1024+1232);
    my_reduction_test(7*1024*1024+94854);

    // A few random sizes up to 18 million elements
    for(int i = 0; i < 4; i++)
    {
        float ratio = float(rand())/float(RAND_MAX);
        ratio = ratio >= 0 ? ratio : -ratio;
        int big_num = int(ratio*18*1e6);

        my_reduction_test(big_num);
    }

    pause();

    return 0;
}