Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Last-warp stage of a tree reduction over 64 consecutive floats.
//
// Each calling thread folds sdata[tid + 32], sdata[tid + 16], ..., sdata[tid + 1]
// into sdata[tid]. When the 32 lanes of one warp call this with consecutive
// `tid` values t0 .. t0+31, sdata[t0] ends up holding the sum of
// sdata[t0 .. t0+63]; the other lanes' slots hold partial sums only.
//
// Preconditions:
//   * sdata[tid + 32] must be a valid element for every calling thread.
//   * All (non-exited) lanes of the warp must call this together, because
//     __syncwarp() is used with the default full mask.
//
// Why the explicit __syncwarp(): each step reads a slot that a neighbouring
// lane overwrites in the same step. `volatile` alone only prevents the
// compiler from caching values; it does not keep lanes in lock-step on
// architectures with independent thread scheduling. Splitting every step into
// read / barrier / write / barrier makes the ordering explicit.
__device__ void warpReduce(volatile float* sdata, int tid) {
#pragma unroll
    for (int offset = 32; offset > 0; offset >>= 1) {
        // Read both operands before any lane is allowed to store.
        const float sum = sdata[tid] + sdata[tid + offset];
        __syncwarp();
        sdata[tid] = sum;
        // Make the store visible to the lanes that read it in the next step.
        __syncwarp();
    }
}
// Last-warp stage of a tree reduction, with the number of steps selected by
// `blockSize`.
//
// A step with stride s (s = 32, 16, 8, 4, 2, 1) is performed only when
// blockSize >= 2 * s; in that step the calling thread adds sdata[tid + s]
// into sdata[tid]. With blockSize >= 64 all six steps run and the behaviour
// matches a full 64-element warp reduction.
//
// Preconditions:
//   * `blockSize` must be the same value in every calling thread of the warp;
//     the barriers below sit inside branches that depend on it.
//   * sdata[tid + s] must be valid for every stride that is enabled.
//   * All (non-exited) lanes of the warp must call this together, because
//     __syncwarp() is used with the default full mask.
//
// Why the explicit __syncwarp(): each step reads a slot that a neighbouring
// lane overwrites in the same step, and `volatile` does not keep lanes in
// lock-step under independent thread scheduling. Every step is therefore
// split into read / barrier / write / barrier.
__device__ void warpReduce2(volatile float* sdata, int tid, int blockSize) {
#pragma unroll
    for (int offset = 32; offset > 0; offset >>= 1) {
        if (blockSize >= 2 * offset) {
            // Read both operands before any lane is allowed to store.
            const float sum = sdata[tid] + sdata[tid + offset];
            __syncwarp();
            sdata[tid] = sum;
            // Publish the store to the lanes that read it in the next step.
            __syncwarp();
        }
    }
}
// In-place block reduction in global memory, one input element per thread.
//
// Block b sums g_idata[b * blockDim.x .. (b + 1) * blockDim.x) and writes the
// result to g_odata[b]. The partial sums are accumulated directly in g_idata,
// so the input buffer is overwritten.
//
// Launch requirements:
//   * 1-D grid and 1-D blocks; blockDim.x must be a power of two (the stride
//     is halved each step).
//   * The element count is hard-coded to 1024 * 128; reads past that index
//     are skipped.
//   * g_idata[blockIdx.x * blockDim.x] is read unconditionally at the end, so
//     the grid must not extend past the data.
//
// Fixes relative to the previous version:
//   * The first stride is blockDim.x / 2 instead of blockDim.x. With one
//     element per thread, a stride of blockDim.x reached into the next
//     block's segment while that block was modifying it.
//   * Only threads with threadIdx.x < stride take part in a step. Without
//     the guard, a thread could read an element that a thread in another
//     warp was updating during the same step.
__global__ void reduce3(float *g_idata, float *g_odata) {
    const unsigned int kNumElements = 1024 * 128;
    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;

    for (unsigned int odstep = blockDim.x / 2; odstep > 0; odstep /= 2) {
        if (threadIdx.x < odstep && tid + odstep < kNumElements)
            g_idata[tid] += g_idata[tid + odstep];
        // Barrier stays outside the branch so every thread in the block reaches it.
        __syncthreads();
    }

    // Thread 0's slot is the first element of the block's segment.
    if (threadIdx.x == 0) g_odata[blockIdx.x] = g_idata[blockIdx.x * blockDim.x];
}
// In-place block reduction in global memory, two input elements per thread.
//
// Block b sums g_idata[2 * b * blockDim.x .. 2 * (b + 1) * blockDim.x) and
// writes the result to g_odata[b]. The first step (stride blockDim.x) folds
// the upper half of the segment onto the lower half; later steps halve the
// stride. Partial sums are accumulated directly in g_idata, so the input
// buffer is overwritten.
//
// Launch requirements:
//   * 1-D grid and 1-D blocks; blockDim.x must be a power of two.
//   * The element count is hard-coded to 1024 * 128; reads past that index
//     are skipped.
//   * g_idata[2 * blockIdx.x * blockDim.x] is read unconditionally at the
//     end, so the grid must not extend past the data.
//
// Fix relative to the previous version: only threads with
// threadIdx.x < stride take part in a step. Without the guard, a thread could
// read an element that a thread in another warp was updating during the same
// step, which double-counts data.
__global__ void reduce4(float *g_idata, float *g_odata) {
    const unsigned int kNumElements = 1024 * 128;
    const unsigned int tid = threadIdx.x + 2 * blockDim.x * blockIdx.x;

    for (unsigned int odstep = blockDim.x; odstep > 0; odstep /= 2) {
        if (threadIdx.x < odstep && tid + odstep < kNumElements)
            g_idata[tid] += g_idata[tid + odstep];
        // Barrier stays outside the branch so every thread in the block reaches it.
        __syncthreads();
    }

    // Thread 0's slot is the first element of the block's segment.
    if (threadIdx.x == 0) g_odata[blockIdx.x] = g_idata[2 * blockIdx.x * blockDim.x];
}
// In-place block reduction in global memory, two input elements per thread,
// with the last six steps (strides 32 .. 1) delegated to warpReduce().
//
// Block b sums g_idata[2 * b * blockDim.x .. 2 * (b + 1) * blockDim.x) and
// writes the result to g_odata[b]. Partial sums are accumulated directly in
// g_idata, so the input buffer is overwritten.
//
// Launch requirements:
//   * 1-D grid and 1-D blocks; blockDim.x must be a power of two and at
//     least 32, because warpReduce() always starts at stride 32.
//   * The element count is hard-coded to 1024 * 128 for the loop stage;
//     warpReduce() performs no bounds check, so each block's full
//     2 * blockDim.x segment must lie inside the buffer.
//
// Fix relative to the previous version: only threads with
// threadIdx.x < stride take part in a loop step. Without the guard, a thread
// could read an element that a thread in another warp was updating during
// the same step, which double-counts data.
__global__ void reduce5(float *g_idata, float *g_odata) {
    const unsigned int kNumElements = 1024 * 128;
    const unsigned int tid = threadIdx.x + 2 * blockDim.x * blockIdx.x;

    // Strides blockDim.x .. 64: block-wide steps separated by barriers.
    for (unsigned int odstep = blockDim.x; odstep > 32; odstep /= 2) {
        if (threadIdx.x < odstep && tid + odstep < kNumElements) {
            g_idata[tid] += g_idata[tid + odstep];
        }
        // Barrier stays outside the branch so every thread in the block reaches it.
        __syncthreads();
    }

    // Strides 32 .. 1: handled by the first warp only.
    if (threadIdx.x < 32)
        warpReduce(g_idata, tid);

    // Thread 0's slot is the first element of the block's segment.
    if (threadIdx.x == 0) g_odata[blockIdx.x] = g_idata[2 * blockIdx.x * blockDim.x];
}
// In-place block reduction in global memory, two input elements per thread,
// with the step sequence selected by the `blockSize` argument.
//
// Block b sums g_idata[2 * b * blockDim.x .. 2 * (b + 1) * blockDim.x) and
// writes the result to g_odata[b]. Partial sums are accumulated directly in
// g_idata, so the input buffer is overwritten.
//
// Parameters:
//   g_idata   - input buffer; modified in place.
//   g_odata   - one output value per block.
//   blockSize - selects which strides run. Segment bases are computed from
//               blockDim.x while strides come from blockSize, so the two
//               must agree (power of two, at most 1024) for the whole
//               segment to be covered.
//
// The element count is hard-coded to 1024 * 128 for the block-wide steps;
// warpReduce2() performs no bounds check.
//
// Fixes relative to the previous version:
//   * Only threads with threadIdx.x < stride take part in a block-wide step.
//     Without the guard, a thread could read an element that a thread in
//     another warp was updating during the same step.
//   * warpReduce2() enables stride s only when its size argument is >= 2 * s.
//     Passing blockSize therefore skipped the top stride whenever
//     blockSize < 64, dropping half of the block's segment. It now receives
//     the number of elements still to be reduced, min(2 * blockSize, 64).
//     For blockSize >= 64 this enables exactly the same steps as before.
__global__ void reduce6(float *g_idata, float *g_odata, int blockSize) {
    const unsigned int kNumElements = 1024 * 128;
    const unsigned int tid = threadIdx.x + 2 * blockDim.x * blockIdx.x;

    // Block-wide steps for strides 1024, 512, 256, 128, 64. `blockSize` is a
    // kernel argument, so the branch (and the barrier inside it) is uniform
    // across the block.
#pragma unroll
    for (int stride = 1024; stride >= 64; stride /= 2) {
        if (blockSize >= stride) {
            const unsigned int s = static_cast<unsigned int>(stride);
            if (threadIdx.x < s && tid + s < kNumElements) {
                g_idata[tid] += g_idata[tid + s];
            }
            __syncthreads();
        }
    }

    // Strides 32 .. 1 are handled by the first warp.
    if (threadIdx.x < 32) {
        const int remaining = (blockSize >= 32) ? 64 : 2 * blockSize;
        warpReduce2(g_idata, tid, remaining);
    }

    // Thread 0's slot is the first element of the block's segment.
    if (threadIdx.x == 0) g_odata[blockIdx.x] = g_idata[2 * blockIdx.x * blockDim.x];
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement