Untitled

/*
Copyright 2011 Andreas Kloeckner
Copyright 2008-2011 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Derived from thrust/detail/backend/cuda/detail/fast_scan.inl
within the Thrust project, https://code.google.com/p/thrust/

Direct browse link:
https://code.google.com/p/thrust/source/browse/thrust/detail/backend/cuda/detail/fast_scan.inl
*/
#define SWG_SIZE 128
#define K 8

#define UWG_SIZE 256

#define REQD_WG_SIZE(X,Y,Z) __attribute__((reqd_work_group_size(X, Y, Z)))

#define SCAN_EXPR(a, b) a+b
typedef uint scan_type;

__kernel
REQD_WG_SIZE(UWG_SIZE, 1, 1)
void scan_final_update(
  __global scan_type *output,
  const uint N,
  const uint interval_size,
  __global scan_type *group_results)
{
  const uint interval_begin = interval_size * get_group_id(0);
  const uint interval_end   = min(interval_begin + interval_size, N);

  if (get_group_id(0) == 0)
    return;

  // value to add to this segment
  scan_type prev_group_sum = group_results[get_group_id(0) - 1];

  // advance result pointer
  output += interval_begin + get_local_id(0);

  for(uint unit_base = interval_begin;
      unit_base < interval_end;
      unit_base += UWG_SIZE, output += UWG_SIZE)
  {
    const uint i = unit_base + get_local_id(0);

    if(i < interval_end) {
      *output = SCAN_EXPR(prev_group_sum, *output);
    }
  }
}

void scan_group(__local scan_type *array)
{
  scan_type val = array[get_local_id(0)];

  for (uint offset=1; offset <= SWG_SIZE; offset *= 2) {
    if (get_local_id(0) >= offset) {
      scan_type tmp = array[get_local_id(0) - offset];
      val = SCAN_EXPR(tmp, val);
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    array[get_local_id(0)] = val;
    barrier(CLK_LOCAL_MEM_FENCE);
  }
}

void scan_group_n(__local scan_type *array, const uint n)
{
  scan_type val = array[get_local_id(0)];

  for (uint offset=1; offset <= SWG_SIZE; offset *= 2) {
    if (get_local_id(0) >= offset && get_local_id(0) < n) {
      scan_type tmp = array[get_local_id(0) - offset];
      val = SCAN_EXPR(tmp, val);
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    array[get_local_id(0)] = val;
    barrier(CLK_LOCAL_MEM_FENCE);
  }
}

__kernel
REQD_WG_SIZE(SWG_SIZE, 1, 1)
void scan_scan_intervals(
  __global scan_type *input,
  const uint N,
  const uint interval_size,
  __global scan_type *output,
  __global scan_type *group_results)
{
  // padded in WG_SIZE to avoid bank conflicts
  // index K in first dimension used for carry storage
  __local scan_type ldata[K + 1][SWG_SIZE + 1];

  const uint interval_begin = interval_size * get_group_id(0);
  const uint interval_end   = min(interval_begin + interval_size, N);

  const uint unit_size  = K * SWG_SIZE;

  uint unit_base = interval_begin;


  for(; unit_base + unit_size <= interval_end; unit_base += unit_size) {
    // Algorithm: Each work group is responsible for one contiguous
    // 'interval', of which there are just enough to fill all compute
    // units.  Intervals are split into 'units'. A unit is what gets
    // worked on in parallel by one work group.

    // Each unit has two axes--the local-id axis and the k axis.
    //
    // * * * * * * * * * * ----> lid
    // * * * * * * * * * *
    // * * * * * * * * * *
    // * * * * * * * * * *
    // * * * * * * * * * *
    // |
    // v k

    // This is a three-phase algorithm, in which first each interval
    // does its local scan, then a scan across intervals exchanges data
    // globally, and the final update adds the exchanged sums to each
    // interval.

    // Exclusive scan is realized by performing a right-shift inside
    // the final update.

    // read a unit's worth of data from global

    for(uint k = 0; k < K; k++) {
      const uint offset = k*SWG_SIZE + get_local_id(0);

      ldata[offset % K][offset / K] = input[unit_base + offset];
    }

    // carry in from previous unit, if applicable.
    if (get_local_id(0) == 0 && unit_base != interval_begin)
      ldata[0][0] = SCAN_EXPR(ldata[K][SWG_SIZE - 1], ldata[0][0]);

    barrier(CLK_LOCAL_MEM_FENCE);

    // scan along k (sequentially in each work item)
    scan_type sum = ldata[0][get_local_id(0)];

    for(uint k = 1; k < K; k++) {
      scan_type tmp = ldata[k][get_local_id(0)];
      sum = SCAN_EXPR(sum, tmp);
      ldata[k][get_local_id(0)] = sum;
    }

    // store carry in out-of-bounds (padding) array entry in the K direction
    ldata[K][get_local_id(0)] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    // tree-based parallel scan along local id
    scan_group(&ldata[K][0]);

    // update local values
    if (get_local_id(0) > 0) {
      sum = ldata[K][get_local_id(0) - 1];

      for(uint k = 0; k < K; k++) {
      scan_type tmp = ldata[k][get_local_id(0)];
      ldata[k][get_local_id(0)] = SCAN_EXPR(sum, tmp);
      }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // write data
    for(uint k = 0; k < K; k++) {
      const uint offset = k*SWG_SIZE + get_local_id(0);

      output[unit_base + offset] = ldata[offset % K][offset / K];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
  }


  if (unit_base < interval_end) {
    // Algorithm: Each work group is responsible for one contiguous
    // 'interval', of which there are just enough to fill all compute
    // units.  Intervals are split into 'units'. A unit is what gets
    // worked on in parallel by one work group.

    // Each unit has two axes--the local-id axis and the k axis.
    //
    // * * * * * * * * * * ----> lid
    // * * * * * * * * * *
    // * * * * * * * * * *
    // * * * * * * * * * *
    // * * * * * * * * * *
    // |
    // v k

    // This is a three-phase algorithm, in which first each interval
    // does its local scan, then a scan across intervals exchanges data
    // globally, and the final update adds the exchanged sums to each
    // interval.

    // Exclusive scan is realized by performing a right-shift inside
    // the final update.

    // read a unit's worth of data from global

    for(uint k = 0; k < K; k++) {
      const uint offset = k*SWG_SIZE + get_local_id(0);

      if (unit_base + offset < interval_end) {
    ldata[offset % K][offset / K] = input[unit_base + offset];
      }
    }

    // carry in from previous unit, if applicable.
    if (get_local_id(0) == 0 && unit_base != interval_begin)
      ldata[0][0] = SCAN_EXPR(ldata[K][SWG_SIZE - 1], ldata[0][0]);

    barrier(CLK_LOCAL_MEM_FENCE);

    // scan along k (sequentially in each work item)
    scan_type sum = ldata[0][get_local_id(0)];

    const uint offset_end = interval_end - unit_base;

    for(uint k = 1; k < K; k++) {
      if (K * get_local_id(0) + k < offset_end) {
    scan_type tmp = ldata[k][get_local_id(0)];
    sum = SCAN_EXPR(sum, tmp);
    ldata[k][get_local_id(0)] = sum;
      }
    }

    // store carry in out-of-bounds (padding) array entry in the K direction
    ldata[K][get_local_id(0)] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    // tree-based parallel scan along local id
    scan_group_n(&ldata[K][0], offset_end / K);

    // update local values
    if (get_local_id(0) > 0) {
      sum = ldata[K][get_local_id(0) - 1];

      for(uint k = 0; k < K; k++) {
    if (K * get_local_id(0) + k < offset_end) {
      scan_type tmp = ldata[k][get_local_id(0)];
      ldata[k][get_local_id(0)] = SCAN_EXPR(sum, tmp);
    }
      }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // write data
    for(uint k = 0; k < K; k++) {
      const uint offset = k*SWG_SIZE + get_local_id(0);

      if (unit_base + offset < interval_end) {
    output[unit_base + offset] = ldata[offset % K][offset / K];
      }
    }

    barrier(CLK_LOCAL_MEM_FENCE);
  }


  // write interval sum
  if (get_local_id(0) == 0) {
    group_results[get_group_id(0)] = output[interval_end - 1];
  }
}