Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Block-level sum reduction: each block adds up the elements of d_in that its
// threads map to and writes a single partial sum to d_out[blockIdx.x].
//
// Launch requirements:
//   - 1D grid of 1D blocks (only the .x components are used).
//   - Dynamic shared memory of at least blockDim.x * sizeof(float) bytes
//     (third launch parameter).
//   - d_out must have room for gridDim.x floats.
//
// Parameters:
//   d_in  - input array; only indices below n are read.
//   n     - number of valid elements in d_in. Threads whose global index is
//           >= n contribute 0 to the sum.
//   d_out - output array; element blockIdx.x receives this block's sum.
__global__ void reduce_kernel(float* d_in, int n, float* d_out) {
    extern __shared__ float shared_mem[];

    const unsigned int tid = threadIdx.x;
    // 64-bit global index so blockIdx.x * blockDim.x cannot wrap on large grids.
    const long long i = static_cast<long long>(blockIdx.x) * blockDim.x + tid;

    // Stage one element per thread. Out-of-range threads store the additive
    // identity so the reduction below needs no further bounds checks.
    shared_mem[tid] = (i < n) ? d_in[i] : 0.0f;
    __syncthreads();

    // Pairwise tree reduction in shared memory. `active` is the number of
    // live partial sums; each pass folds the upper part onto the lower part.
    // Rounding `half` up keeps the middle element when `active` is odd, so
    // block sizes that are not a power of two are summed correctly.
    // The loop bounds depend only on blockDim.x, so every thread in the block
    // reaches each __syncthreads().
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid < active - half) {
            shared_mem[tid] += shared_mem[tid + half];
        }
        __syncthreads();
        active = half;
    }

    // Thread 0 now holds the block total.
    if (tid == 0) {
        d_out[blockIdx.x] = shared_mem[0];
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement