Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- Copyright 2011 Andreas Kloeckner
- Copyright 2008-2011 NVIDIA Corporation
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- Derived from thrust/detail/backend/cuda/detail/fast_scan.inl
- within the Thrust project, https://code.google.com/p/thrust/
- Direct browse link:
- https://code.google.com/p/thrust/source/browse/thrust/detail/backend/cuda/detail/fast_scan.inl
- */
- #define SWG_SIZE 128
- #define K 8
- #define UWG_SIZE 256
- #define REQD_WG_SIZE(X,Y,Z) __attribute__((reqd_work_group_size(X, Y, Z)))
- #define SCAN_EXPR(a, b) a+b
- typedef uint scan_type;
- __kernel
- REQD_WG_SIZE(UWG_SIZE, 1, 1)
- void scan_final_update(
- __global scan_type *output,
- const uint N,
- const uint interval_size,
- __global scan_type *group_results)
- {
- const uint interval_begin = interval_size * get_group_id(0);
- const uint interval_end = min(interval_begin + interval_size, N);
- if (get_group_id(0) == 0)
- return;
- // value to add to this segment
- scan_type prev_group_sum = group_results[get_group_id(0) - 1];
- // advance result pointer
- output += interval_begin + get_local_id(0);
- for(uint unit_base = interval_begin;
- unit_base < interval_end;
- unit_base += UWG_SIZE, output += UWG_SIZE)
- {
- const uint i = unit_base + get_local_id(0);
- if(i < interval_end) {
- *output = SCAN_EXPR(prev_group_sum, *output);
- }
- }
- }
- void scan_group(__local scan_type *array)
- {
- scan_type val = array[get_local_id(0)];
- for (uint offset=1; offset <= SWG_SIZE; offset *= 2) {
- if (get_local_id(0) >= offset) {
- scan_type tmp = array[get_local_id(0) - offset];
- val = SCAN_EXPR(tmp, val);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- array[get_local_id(0)] = val;
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- }
- void scan_group_n(__local scan_type *array, const uint n)
- {
- scan_type val = array[get_local_id(0)];
- for (uint offset=1; offset <= SWG_SIZE; offset *= 2) {
- if (get_local_id(0) >= offset && get_local_id(0) < n) {
- scan_type tmp = array[get_local_id(0) - offset];
- val = SCAN_EXPR(tmp, val);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- array[get_local_id(0)] = val;
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- }
- __kernel
- REQD_WG_SIZE(SWG_SIZE, 1, 1)
- void scan_scan_intervals(
- __global scan_type *input,
- const uint N,
- const uint interval_size,
- __global scan_type *output,
- __global scan_type *group_results)
- {
- // padded in WG_SIZE to avoid bank conflicts
- // index K in first dimension used for carry storage
- __local scan_type ldata[K + 1][SWG_SIZE + 1];
- const uint interval_begin = interval_size * get_group_id(0);
- const uint interval_end = min(interval_begin + interval_size, N);
- const uint unit_size = K * SWG_SIZE;
- uint unit_base = interval_begin;
- for(; unit_base + unit_size <= interval_end; unit_base += unit_size) {
- // Algorithm: Each work group is responsible for one contiguous
- // 'interval', of which there are just enough to fill all compute
- // units. Intervals are split into 'units'. A unit is what gets
- // worked on in parallel by one work group.
- // Each unit has two axes--the local-id axis and the k axis.
- //
- // * * * * * * * * * * ----> lid
- // * * * * * * * * * *
- // * * * * * * * * * *
- // * * * * * * * * * *
- // * * * * * * * * * *
- // |
- // v k
- // This is a three-phase algorithm, in which first each interval
- // does its local scan, then a scan across intervals exchanges data
- // globally, and the final update adds the exchanged sums to each
- // interval.
- // Exclusive scan is realized by performing a right-shift inside
- // the final update.
- // read a unit's worth of data from global
- for(uint k = 0; k < K; k++) {
- const uint offset = k*SWG_SIZE + get_local_id(0);
- ldata[offset % K][offset / K] = input[unit_base + offset];
- }
- // carry in from previous unit, if applicable.
- if (get_local_id(0) == 0 && unit_base != interval_begin)
- ldata[0][0] = SCAN_EXPR(ldata[K][SWG_SIZE - 1], ldata[0][0]);
- barrier(CLK_LOCAL_MEM_FENCE);
- // scan along k (sequentially in each work item)
- scan_type sum = ldata[0][get_local_id(0)];
- for(uint k = 1; k < K; k++) {
- scan_type tmp = ldata[k][get_local_id(0)];
- sum = SCAN_EXPR(sum, tmp);
- ldata[k][get_local_id(0)] = sum;
- }
- // store carry in out-of-bounds (padding) array entry in the K direction
- ldata[K][get_local_id(0)] = sum;
- barrier(CLK_LOCAL_MEM_FENCE);
- // tree-based parallel scan along local id
- scan_group(&ldata[K][0]);
- // update local values
- if (get_local_id(0) > 0) {
- sum = ldata[K][get_local_id(0) - 1];
- for(uint k = 0; k < K; k++) {
- scan_type tmp = ldata[k][get_local_id(0)];
- ldata[k][get_local_id(0)] = SCAN_EXPR(sum, tmp);
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- // write data
- for(uint k = 0; k < K; k++) {
- const uint offset = k*SWG_SIZE + get_local_id(0);
- output[unit_base + offset] = ldata[offset % K][offset / K];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if (unit_base < interval_end) {
- // Algorithm: Each work group is responsible for one contiguous
- // 'interval', of which there are just enough to fill all compute
- // units. Intervals are split into 'units'. A unit is what gets
- // worked on in parallel by one work group.
- // Each unit has two axes--the local-id axis and the k axis.
- //
- // * * * * * * * * * * ----> lid
- // * * * * * * * * * *
- // * * * * * * * * * *
- // * * * * * * * * * *
- // * * * * * * * * * *
- // |
- // v k
- // This is a three-phase algorithm, in which first each interval
- // does its local scan, then a scan across intervals exchanges data
- // globally, and the final update adds the exchanged sums to each
- // interval.
- // Exclusive scan is realized by performing a right-shift inside
- // the final update.
- // read a unit's worth of data from global
- for(uint k = 0; k < K; k++) {
- const uint offset = k*SWG_SIZE + get_local_id(0);
- if (unit_base + offset < interval_end) {
- ldata[offset % K][offset / K] = input[unit_base + offset];
- }
- }
- // carry in from previous unit, if applicable.
- if (get_local_id(0) == 0 && unit_base != interval_begin)
- ldata[0][0] = SCAN_EXPR(ldata[K][SWG_SIZE - 1], ldata[0][0]);
- barrier(CLK_LOCAL_MEM_FENCE);
- // scan along k (sequentially in each work item)
- scan_type sum = ldata[0][get_local_id(0)];
- const uint offset_end = interval_end - unit_base;
- for(uint k = 1; k < K; k++) {
- if (K * get_local_id(0) + k < offset_end) {
- scan_type tmp = ldata[k][get_local_id(0)];
- sum = SCAN_EXPR(sum, tmp);
- ldata[k][get_local_id(0)] = sum;
- }
- }
- // store carry in out-of-bounds (padding) array entry in the K direction
- ldata[K][get_local_id(0)] = sum;
- barrier(CLK_LOCAL_MEM_FENCE);
- // tree-based parallel scan along local id
- scan_group_n(&ldata[K][0], offset_end / K);
- // update local values
- if (get_local_id(0) > 0) {
- sum = ldata[K][get_local_id(0) - 1];
- for(uint k = 0; k < K; k++) {
- if (K * get_local_id(0) + k < offset_end) {
- scan_type tmp = ldata[k][get_local_id(0)];
- ldata[k][get_local_id(0)] = SCAN_EXPR(sum, tmp);
- }
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- // write data
- for(uint k = 0; k < K; k++) {
- const uint offset = k*SWG_SIZE + get_local_id(0);
- if (unit_base + offset < interval_end) {
- output[unit_base + offset] = ldata[offset % K][offset / K];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- // write interval sum
- if (get_local_id(0) == 0) {
- group_results[get_group_id(0)] = output[interval_end - 1];
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement