Advertisement
Guest User

Untitled

a guest
Feb 25th, 2017
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 2.72 KB | None | 0 0
  // Final warp-level stage of an in-place tree reduction.
  //
  // Folds sdata[tid + 32], sdata[tid + 16], ..., sdata[tid + 1] into sdata[tid],
  // one offset at a time. No bounds checks are performed, so sdata[tid + 63]
  // must be a valid element for the highest lane that calls this.
  //
  // Parameters:
  //   sdata - buffer being reduced in place (declared volatile so every access
  //           really goes to memory instead of a cached register copy).
  //   tid   - index of the element owned by the calling thread.
  //
  // Preconditions: every lane of the warp that enters this function must enter
  // it together (the default __syncwarp() mask names the whole warp).
  // Requires CUDA 9.0 or newer for __syncwarp().
  __device__ void warpReduce(volatile float* sdata, int tid) {
      for (int offset = 32; offset > 0; offset >>= 1) {
          // Read phase: fetch the partner value before any lane overwrites it.
          const float addend = sdata[tid + offset];
          // Warp-lockstep execution is not guaranteed under independent thread
          // scheduling, so the read and write phases are separated explicitly.
          __syncwarp();
          // Write phase: accumulate into this lane's own element.
          sdata[tid] += addend;
          // Make the update visible to the other lanes before the next offset.
          __syncwarp();
      }
  }
  9.  
  // Final warp-level stage of an in-place tree reduction, with each step
  // enabled or disabled by blockSize.
  //
  // The step with offset k (k = 32, 16, 8, 4, 2, 1) is applied only when
  // blockSize >= 2 * k, i.e. blockSize >= 64 enables offset 32, blockSize >= 32
  // enables offset 16, and so on down to blockSize >= 2 enabling offset 1.
  // No bounds checks are performed on sdata.
  //
  // Parameters:
  //   sdata     - buffer being reduced in place (volatile so every access goes
  //               to memory).
  //   tid       - index of the element owned by the calling thread.
  //   blockSize - selects which offsets are applied (see above); it should have
  //               the same value in every calling lane.
  //
  // Preconditions: every lane of the warp that enters this function must enter
  // it together (the default __syncwarp() mask names the whole warp).
  // Requires CUDA 9.0 or newer for __syncwarp().
  __device__ void warpReduce2(volatile float* sdata, int tid, int blockSize) {
      for (int offset = 32; offset > 0; offset >>= 1) {
          const bool stepEnabled = blockSize >= 2 * offset;

          // Read phase: fetch the partner value before any lane overwrites it.
          float addend = 0.0f;
          if (stepEnabled) addend = sdata[tid + offset];

          // The barriers sit outside the conditionals so that every lane always
          // reaches them; warp-lockstep execution is not guaranteed under
          // independent thread scheduling.
          __syncwarp();

          // Write phase: accumulate into this lane's own element.
          if (stepEnabled) sdata[tid] += addend;

          __syncwarp();
      }
  }
  19.  
  // Per-block sum reduction performed in place on g_idata.
  //
  // Each block owns the blockDim.x consecutive elements starting at
  // blockIdx.x * blockDim.x, folds them into the first element of that segment,
  // and writes that partial sum to g_odata[blockIdx.x].
  //
  // Parameters:
  //   g_idata - input values; OVERWRITTEN with intermediate partial sums.
  //   g_odata - receives one partial sum per block.
  //
  // Preconditions: 1D launch; blockDim.x is a power of two; the element count
  // is the hard-coded 1024 * 128 used for the bounds checks below.
  __global__ void reduce3(float *g_idata, float *g_odata) {
      const unsigned int kNumElements = 1024 * 128;
      const unsigned int segmentStart = blockDim.x * blockIdx.x;
      const unsigned int idx = segmentStart + threadIdx.x;

      // Start at half the block width: an initial offset of blockDim.x would
      // reach into the next block's segment, double-counting its elements and
      // racing with that block's own updates.
      for (unsigned int offset = blockDim.x / 2; offset > 0; offset /= 2) {
          // Only the lower `offset` threads accumulate. Their partners
          // (idx + offset) are not written during this step, so no thread reads
          // a cell that another thread is updating concurrently.
          if (threadIdx.x < offset && idx + offset < kNumElements)
              g_idata[idx] += g_idata[idx + offset];
          // Barrier is outside the conditional so every thread reaches it.
          __syncthreads();
      }
      if (threadIdx.x == 0) g_odata[blockIdx.x] = g_idata[segmentStart];
  }
  30.  
  // Per-block sum reduction performed in place on g_idata, two elements per
  // thread.
  //
  // Each block owns the 2 * blockDim.x consecutive elements starting at
  // 2 * blockIdx.x * blockDim.x, folds them into the first element of that
  // segment, and writes that partial sum to g_odata[blockIdx.x].
  //
  // Parameters:
  //   g_idata - input values; OVERWRITTEN with intermediate partial sums.
  //   g_odata - receives one partial sum per block.
  //
  // Preconditions: 1D launch; blockDim.x is a power of two; the element count
  // is the hard-coded 1024 * 128 used for the bounds checks below.
  __global__ void reduce4(float *g_idata, float *g_odata) {
      const unsigned int kNumElements = 1024 * 128;
      const unsigned int segmentStart = 2 * blockDim.x * blockIdx.x;
      const unsigned int idx = segmentStart + threadIdx.x;

      for (unsigned int offset = blockDim.x; offset > 0; offset /= 2) {
          // Only the lower `offset` threads accumulate. Their partners
          // (idx + offset) are not written during this step, so no thread reads
          // a cell that another thread is updating concurrently.
          if (threadIdx.x < offset && idx + offset < kNumElements)
              g_idata[idx] += g_idata[idx + offset];
          // Barrier is outside the conditional so every thread reaches it.
          __syncthreads();
      }
      if (threadIdx.x == 0) g_odata[blockIdx.x] = g_idata[segmentStart];
  }
  41.  
  // Per-block sum reduction performed in place on g_idata, two elements per
  // thread, with the last six steps (offsets 32..1) delegated to warpReduce.
  //
  // Each block owns the 2 * blockDim.x consecutive elements starting at
  // 2 * blockIdx.x * blockDim.x, folds them into the first element of that
  // segment, and writes that partial sum to g_odata[blockIdx.x].
  //
  // Parameters:
  //   g_idata - input values; OVERWRITTEN with intermediate partial sums.
  //   g_odata - receives one partial sum per block.
  //
  // Preconditions: 1D launch; blockDim.x is a power of two and at least 32
  // (warpReduce always applies offsets 32..1 and performs no bounds checks, so
  // the block's segment must hold at least 64 valid elements); the element
  // count is the hard-coded 1024 * 128 used for the bounds checks below.
  __global__ void reduce5(float *g_idata, float *g_odata) {
      const unsigned int kNumElements = 1024 * 128;
      const unsigned int segmentStart = 2 * blockDim.x * blockIdx.x;
      const unsigned int idx = segmentStart + threadIdx.x;

      // Block-level steps: offsets blockDim.x down to 64.
      for (unsigned int offset = blockDim.x; offset > 32; offset /= 2) {
          // Only the lower `offset` threads accumulate. Their partners
          // (idx + offset) are not written during this step, so no thread reads
          // a cell that another thread is updating concurrently.
          if (threadIdx.x < offset && idx + offset < kNumElements) {
              g_idata[idx] += g_idata[idx + offset];
          }
          // Barrier is outside the conditional so every thread reaches it.
          __syncthreads();
      }

      // Warp-level steps: handled by the first 32 threads of the block.
      if (threadIdx.x < 32)
          warpReduce(g_idata, idx);

      // Thread 0 reads back the element it accumulated into itself.
      if (threadIdx.x == 0) g_odata[blockIdx.x] = g_idata[segmentStart];
  }
  56.  
  // Per-block sum reduction performed in place on g_idata, two elements per
  // thread, with the steps selected by the blockSize argument.
  //
  // Each block owns the 2 * blockDim.x consecutive elements starting at
  // 2 * blockIdx.x * blockDim.x, folds them into the first element of that
  // segment, and writes that partial sum to g_odata[blockIdx.x].
  //
  // Parameters:
  //   g_idata   - input values; OVERWRITTEN with intermediate partial sums.
  //   g_odata   - receives one partial sum per block.
  //   blockSize - selects which offsets are applied: a block-level step with
  //               offset s (1024, 512, 256, 128, 64) runs when blockSize >= s.
  //               NOTE(review): the offsets only cover the block's segment
  //               correctly when blockSize == blockDim.x (a power of two);
  //               confirm against the host launch code.
  //
  // Preconditions: 1D launch; the element count is the hard-coded 1024 * 128
  // used for the bounds checks below. The warp-level stage performs no bounds
  // checks.
  __global__ void reduce6(float *g_idata, float *g_odata, int blockSize) {
      const unsigned int kNumElements = 1024 * 128;
      const unsigned int segmentStart = 2 * blockDim.x * blockIdx.x;
      const unsigned int idx = segmentStart + threadIdx.x;

      // Block-level steps: offsets 1024, 512, 256, 128, 64.
      for (unsigned int offset = 1024; offset >= 64; offset /= 2) {
          // blockSize is a kernel argument, hence uniform across the block, so
          // either every thread reaches the barrier below or none does.
          if (blockSize >= static_cast<int>(offset)) {
              // Only the lower `offset` threads accumulate. Their partners
              // (idx + offset) are not written during this step, so no thread
              // reads a cell that another thread is updating concurrently.
              if (threadIdx.x < offset && idx + offset < kNumElements) {
                  g_idata[idx] += g_idata[idx + offset];
              }
              __syncthreads();
          }
      }

      // Warp-level steps: offsets 32..1, handled by the first 32 threads.
      // warpReduce2 applies offset k only when its last argument is >= 2 * k.
      // Each block here spans 2 * blockSize elements, so that is the value
      // passed; passing blockSize itself would skip the first needed offset
      // whenever blockSize <= 32.
      if (threadIdx.x < 32) {
          warpReduce2(g_idata, idx, 2 * blockSize);
      }

      // Thread 0 reads back the element it accumulated into itself.
      if (threadIdx.x == 0) g_odata[blockIdx.x] = g_idata[segmentStart];
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement