Untitled

#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
/*
 * Integer sum reduction with two modes, chosen by the element count n.
 *
 *   n <= 512 : every work-item with a valid index atomically adds its element
 *              of `in` into out[0]. The kernel never clears out[0], so the
 *              caller is presumably expected to zero it beforehand.
 *   n >  512 : each work-group sums its slice of `in` in local memory and
 *              work-item 0 stores the group's partial sum to in[group id].
 *
 * in  - input values; in the n > 512 mode it is also overwritten with the
 *       per-group partial sums.
 * out - out[0] accumulates the total in the n <= 512 mode.
 * buf - local scratch space; indexed by local id, so it must hold at least
 *       get_local_size(0) ints.
 * n   - number of valid elements in `in`.
 *
 * NOTE(review): in the n > 512 mode, group g writes in[g] while another group
 * may still have to read that same element as input. OpenCL gives no ordering
 * between work-groups, so this looks like an inter-group race. Fixing it needs
 * a separate destination buffer for the partial sums, which would change the
 * host-visible contract, so it is left as is here - confirm against the host.
 */
__kernel void reduce(__global int *in, __global int *out,
             __local int *buf, int n)
{
  /* Inputs this small are summed directly with atomics. */
  const int atomic_threshold = 512;

  const size_t tid = get_local_id(0);
  const size_t gid = get_group_id(0);
  const size_t idx = get_global_id(0);
  const size_t dim = get_local_size(0);

  /* Level 0: direct atomic accumulation. */
  if (n <= atomic_threshold) {
    /* Test the sign before converting so a negative n selects no elements
       instead of wrapping around to a huge unsigned bound. */
    if (n > 0 && idx < (size_t)n) {
      atomic_add(&out[0], in[idx]);
    }
    return;
  }

  /* Level 1: per-group reduction in local memory.
     n is positive here. Work-items beyond the end of the input contribute 0
     (the additive identity) rather than reading out of bounds. */
  buf[tid] = (idx < (size_t)n) ? in[idx] : 0;
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Pairwise sums with a doubling stride. The bound check on the partner
     index keeps the loop correct when the local size is not a power of two;
     for power-of-two sizes the additions are the same as a plain halving
     reduction. Every work-item runs the same number of iterations, so the
     barrier is reached uniformly. */
  for (size_t stride = 1; stride < dim; stride *= 2) {
    if (tid % (2 * stride) == 0 && tid + stride < dim) {
      buf[tid] += buf[tid + stride];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  /* One work-item per group writes the partial sum back into the input. */
  if (tid == 0) {
    in[gid] = buf[0];
  }
}