Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Block-level integer sum reduction.
//
// Each block sums a strided slice of g_idata[0..n) and writes ONE partial sum
// to g_odata[blockIdx.x]; the caller combines the gridDim.x partials (on the
// host or with a second launch).
//
// Template parameter:
//   blockSize - threads per block; must be a power of two in [1, 1024].
//
// Parameters:
//   g_idata - device pointer to n input ints (read only by this kernel).
//   g_odata - device pointer to at least gridDim.x ints (one result per block).
//   n       - number of input elements; may be 0 and need not be a multiple
//             of 2 * blockSize.
//
// Launch requirements:
//   * 1-D launch with blockDim.x == blockSize (shared-memory indexing and the
//     unrolled reduction depend on it).
//   * Uses blockSize * sizeof(int) bytes of static shared memory.
//   * Any gridDim.x is valid: threads stride over the input.
//   * Uses __shfl_down_sync (CUDA 9+).
//
// Note: sums are accumulated in `int`, so very large inputs can overflow.
template <unsigned int blockSize>
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
    static_assert(blockSize >= 1 && blockSize <= 1024 &&
                      (blockSize & (blockSize - 1)) == 0,
                  "blockSize must be a power of two in [1, 1024]");

    // Per-block scratch space. Keeping partials here (instead of in g_odata)
    // prevents different blocks from trampling each other's intermediate sums.
    __shared__ int sdata[blockSize];

    const unsigned int tid = threadIdx.x;

    // 64-bit indices so neither `i + blockSize` nor `i += stride` can wrap
    // when n is close to UINT_MAX.
    const size_t count = n;
    const size_t stride = static_cast<size_t>(blockSize) * 2 * gridDim.x;
    size_t i = static_cast<size_t>(blockIdx.x) * blockSize * 2 + tid;

    // Reduce multiple elements per thread into a register. Each iteration
    // consumes two elements blockSize apart; the second one is bounds-checked
    // because the tail of the input may not fill a whole 2 * blockSize tile.
    int sum = 0;
    while (i < count) {
        sum += g_idata[i];
        if (i + blockSize < count) {
            sum += g_idata[i + blockSize];
        }
        i += stride;
    }
    sdata[tid] = sum;
    __syncthreads();

    // Tree reduction in shared memory until at most 32 partials remain.
    // The barrier sits outside the `tid < s` branch so every thread reaches it.
    #pragma unroll
    for (unsigned int s = blockSize / 2; s >= 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Final stage inside the first warp using register shuffles. The mask and
    // width cover exactly the lanes that exist, so blocks smaller than a warp
    // never read from a missing lane.
    if (tid < 32) {
        constexpr unsigned int warpLanes = (blockSize < 32) ? blockSize : 32;
        constexpr unsigned int laneMask =
            (warpLanes == 32) ? 0xffffffffu : ((1u << warpLanes) - 1u);

        int v = sdata[tid];
        #pragma unroll
        for (unsigned int offset = warpLanes / 2; offset > 0; offset >>= 1) {
            v += __shfl_down_sync(laneMask, v, offset, warpLanes);
        }

        // Lane 0 now holds the block total; publish one value per block.
        if (tid == 0) {
            g_odata[blockIdx.x] = v;
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement