/*
COMMAND LINE ARGUMENTS

"--n=<N>": Specify the number of elements to reduce (default 16777216)
"--threads=<N>": Specify the maximum number of threads per block (default 256)
"--kernel=<N>": Specify which kernel to run (0-4 or 9, default 0)
"--maxblocks=<N>": Specify the maximum number of thread blocks to launch (default 64)
"--cpufinal": Read back the per-block results and do the final sum of block sums on the CPU (default false)
"--cputhresh=<N>": The number of remaining block sums at or below which the final reduction is done on the CPU (default 1)
*/

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <algorithm>
#include "redAtom.h"

/////////////////
// warp reduce //
/////////////////
/* warp-synchronous reduction using volatile memory
 * to prevent instruction reordering for non-atomic
 * operations */

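/* note: this implicit warp-synchronous pattern relies on the 32 threads of a
 * warp executing in lockstep; the volatile qualifier stops the compiler from
 * caching shared-memory values in registers. On Volta and later architectures
 * (independent thread scheduling) this pattern is no longer safe without
 * __syncwarp() or warp shuffle intrinsics */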
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, int tid) {
    if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
    if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
    if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
    if (blockSize >=  8) sdata[tid] += sdata[tid + 4];
    if (blockSize >=  4) sdata[tid] += sdata[tid + 2];
    if (blockSize >=  2) sdata[tid] += sdata[tid + 1];
}

////////////////////////
// atomic warp reduce //
////////////////////////
/* warp-synchronous reduction using atomic operations
 * to serialize computation */

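/* note: atomicAdd() on shared memory (compute capability 1.2+) serializes the
 * read-modify-write on sdata[tid], but the second operand sdata[tid + offset]
 * is still an ordinary load, so this variant also relies on warp-synchronous
 * execution */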
template <unsigned int blockSize>
__device__ void atomicWarpReduce(int *sdata, int tid) {
    if (blockSize >= 64) atomicAdd(&sdata[tid], sdata[tid + 32]);
    if (blockSize >= 32) atomicAdd(&sdata[tid], sdata[tid + 16]);
    if (blockSize >= 16) atomicAdd(&sdata[tid], sdata[tid + 8]);
    if (blockSize >=  8) atomicAdd(&sdata[tid], sdata[tid + 4]);
    if (blockSize >=  4) atomicAdd(&sdata[tid], sdata[tid + 2]);
    if (blockSize >=  2) atomicAdd(&sdata[tid], sdata[tid + 1]);
}

////////////////////////
// reduction kernel 0 //
////////////////////////
/* fastest reduction algorithm provided by
 * cuda/samples/6_Advanced/reduction/reduction_kernel.cu */

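/* the nIsPow2 template flag is resolved at compile time, so the power-of-two
 * specialization can skip the bounds check on the second load; each thread
 * strides through the input (grid-stride loop) and accumulates in a register
 * before the shared-memory tree reduction */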
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    int sum = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sum += g_idata[i];
        // check bounds
        if (nIsPow2 || i + blockSize < n)
            sum += g_idata[i + blockSize];
        i += gridSize;
    }
    // local sum -> shared memory
    sdata[tid] = sum;
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] = sum = sum + sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] = sum = sum + sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] = sum = sum + sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32) {
        // warp-synchronous reduction
        // volatile memory stores won't be reordered by the compiler
        volatile int *smem = sdata;
        if (blockSize >= 64)
            smem[tid] = sum = sum + smem[tid + 32];
        if (blockSize >= 32)
            smem[tid] = sum = sum + smem[tid + 16];
        if (blockSize >= 16)
            smem[tid] = sum = sum + smem[tid + 8];
        if (blockSize >= 8)
            smem[tid] = sum = sum + smem[tid + 4];
        if (blockSize >= 4)
            smem[tid] = sum = sum + smem[tid + 2];
        if (blockSize >= 2)
            smem[tid] = sum = sum + smem[tid + 1];
    }
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 1 //
////////////////////////
/* fastest reduction algorithm described in
 * cuda/samples/6_Advanced/reduction/doc/reduction.pdf */

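/* unlike kernel 0, the per-thread partial sum is accumulated directly
 * in shared memory rather than in a register */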
template <unsigned int blockSize>
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i];
        // guard the second load in case n is not a multiple of the stride
        if (i + blockSize < n)
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 2 //
////////////////////////
/* reduction kernel 1 executed
 * with atomic warp-synchronous addition */

template <unsigned int blockSize>
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i];
        // guard the second load in case n is not a multiple of the stride
        if (i + blockSize < n)
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 3 //
////////////////////////
/* reduction kernel 2 with atomic additions
 * at every level of the shared-memory reduction */

template <unsigned int blockSize>
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i];
        // guard the second load in case n is not a multiple of the stride
        if (i + blockSize < n)
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            atomicAdd(&sdata[tid], sdata[tid + 256]);
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            atomicAdd(&sdata[tid], sdata[tid + 128]);
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            atomicAdd(&sdata[tid], sdata[tid + 64]);
        __syncthreads();
    }
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 4 //
////////////////////////
/* reduction kernel 3 with the first-level (global -> shared)
 * accumulation also performed with atomicAdd */

template <unsigned int blockSize>
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        int x = g_idata[i];
        // guard the second load in case n is not a multiple of the stride
        if (i + blockSize < n)
            x += g_idata[i + blockSize];
        atomicAdd(&sdata[tid], x);
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            atomicAdd(&sdata[tid], sdata[tid + 256]);
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            atomicAdd(&sdata[tid], sdata[tid + 128]);
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            atomicAdd(&sdata[tid], sdata[tid + 64]);
        __syncthreads();
    }
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 9 //
////////////////////////
/* this reduction kernel interleaves which threads are active by using the
 * modulo operator; modulo is very expensive on GPUs, and the interleaved
 * inactivity means that no whole warps are active, which is also very
 * inefficient */
__global__ void
reduce9(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // load shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    __syncthreads();
    // do reduction in shared memory
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        // modulo arithmetic is slow
        if ((tid % (2 * s)) == 0)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

extern "C" bool isPow2(unsigned int x);

/////////////////////////////
// kernel wrapper function //
/////////////////////////////
/* computes shared memory size, selects the corresponding template
 * for each reduction kernel and launches it */

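/* note: the template switch below only instantiates power-of-two block sizes
 * up to 512, so `threads` must be one of those values; getNumBlocksAndThreads()
 * only guarantees this when maxThreads itself is a power of two no larger than
 * 512. The third launch parameter is the dynamic shared-memory size in bytes. */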
void reduce(int size, int threads, int blocks,
            int kernel, int *d_idata, int *d_odata) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);
    // if there is only one warp per block allocate two warps' worth
    // of shared memory to avoid indexing shared memory out of bounds
    int smemSize = (threads <= 32) ? 2 * threads * sizeof(int) : threads * sizeof(int);
    // select and launch reduction kernel
    switch (kernel) {
    case 0:
        if (isPow2(size)) {
            switch (threads) {
            case 512: reduce0<512, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case 256: reduce0<256, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case 128: reduce0<128, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  64: reduce0< 64, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  32: reduce0< 32, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  16: reduce0< 16, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   8: reduce0<  8, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   4: reduce0<  4, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   2: reduce0<  2, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   1: reduce0<  1, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            }
        }
        else {
            // non-power-of-two input: instantiate with nIsPow2 = false so the
            // kernel bounds-checks the second load
            switch (threads) {
            case 512: reduce0<512, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case 256: reduce0<256, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case 128: reduce0<128, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  64: reduce0< 64, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  32: reduce0< 32, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  16: reduce0< 16, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   8: reduce0<  8, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   4: reduce0<  4, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   2: reduce0<  2, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   1: reduce0<  1, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            }
        }
        break;
    case 1:
        switch (threads) {
        case 512: reduce1<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 256: reduce1<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 128: reduce1<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  64: reduce1< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  32: reduce1< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  16: reduce1< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   8: reduce1<  8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   4: reduce1<  4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   2: reduce1<  2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   1: reduce1<  1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        }
        break;
    case 2:
        switch (threads) {
        case 512: reduce2<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 256: reduce2<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 128: reduce2<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  64: reduce2< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  32: reduce2< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  16: reduce2< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   8: reduce2<  8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   4: reduce2<  4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   2: reduce2<  2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   1: reduce2<  1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        }
        break;
    case 3:
        switch (threads) {
        case 512: reduce3<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 256: reduce3<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 128: reduce3<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  64: reduce3< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  32: reduce3< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  16: reduce3< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   8: reduce3<  8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   4: reduce3<  4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   2: reduce3<  2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   1: reduce3<  1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        }
        break;
    case 4:
        switch (threads) {
        case 512: reduce4<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 256: reduce4<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 128: reduce4<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  64: reduce4< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  32: reduce4< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  16: reduce4< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   8: reduce4<  8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   4: reduce4<  4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   2: reduce4<  2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   1: reduce4<  1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        }
        break;
    case 9: reduce9<<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
    }
}

//////////////////////////
// forward declarations //
//////////////////////////

bool runTest(int argc, char **argv);
#define MAX_BLOCK_DIM_SIZE 65535
#ifdef WIN32
#define strcasecmp strcmpi
#endif
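// a power of two has exactly one bit set, so x & (x-1) clears it and yields
// zero (note that 0 also passes this test)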
extern "C" bool isPow2(unsigned int x) {
    return ((x & (x - 1)) == 0);
}

//////////////////
// program main //
//////////////////

int main(int argc, char **argv) {
    printf("%s Starting...\n\n", argv[0]);
    cudaDeviceProp deviceProp;
    deviceProp.major = 1;
    deviceProp.minor = 0;
    // shared-memory atomicAdd (kernels 2-4) requires compute capability 1.2
    int minimumComputeVersion = 12;
    int dev = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
    if ((deviceProp.major * 10 + deviceProp.minor) >= minimumComputeVersion) {
        printf("Using Device %d: %s\n\n", dev, deviceProp.name);
        checkCudaErrors(cudaSetDevice(dev));
    }
    else {
        printf("Error: the selected device does not support the minimum compute capability of %d.%d.\n\n",
               minimumComputeVersion / 10, minimumComputeVersion % 10);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
    printf("Reducing Integer Array...\n\n");
    bool bResult = runTest(argc, argv);
    cudaDeviceReset();
    printf(bResult ? "Test passed\n" : "Test failed!\n");
    return bResult ? EXIT_SUCCESS : EXIT_FAILURE;
}

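/* CPU reference reduction using Kahan (compensated) summation, kept from the
 * floating-point version of the sample; with integer data the compensation
 * term c is always zero (barring overflow), so this is effectively a plain sum */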
int reduceCPU(int *data, int size) {
    int sum = data[0];
    int c = 0;
    for (int i = 1; i < size; i++) {
        int y = data[i] - c;
        int t = sum + y;
        c = (t - sum) - y;
        sum = t;
    }
    return sum;
}
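/* rounds up to the next power of two by smearing the highest set bit into all
 * lower bit positions and then adding one; the initial decrement keeps exact
 * powers of two unchanged */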
unsigned int nextPow2(unsigned int x) {
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

///////////////
// functions //
///////////////

#ifndef MIN
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
#endif

void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads,
                            int &blocks, int &threads) {
    // get device capability
    cudaDeviceProp prop;
    int device;
    checkCudaErrors(cudaGetDevice(&device));
    checkCudaErrors(cudaGetDeviceProperties(&prop, device));
    threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
    blocks = (n + (threads * 2 - 1)) / (threads * 2);
    // device capability check (64-bit arithmetic to avoid integer overflow)
    if ((long long)threads * blocks > (long long)prop.maxGridSize[0] * prop.maxThreadsPerBlock)
        printf("n is too large -- device capability exceeded\n");
    if (blocks > prop.maxGridSize[0]) {
        printf("grid size <%d> exceeds device capability <%d> -- set block size as %d (original %d)\n",
               blocks, prop.maxGridSize[0], threads * 2, threads);
        blocks /= 2;
        threads *= 2;
    }
    blocks = MIN(maxBlocks, blocks);
}

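/* runs the selected kernel testIterations times; each launch leaves one partial
 * sum per block in d_odata, and the partial sums are then combined either on
 * the CPU (--cpufinal) or by re-launching the kernel on d_odata until at most
 * cpuFinalThreshold values remain */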
int benchmarkReduce(int n,
                    int numThreads,
                    int numBlocks,
                    int maxThreads,
                    int maxBlocks,
                    int kernel,
                    int testIterations,
                    bool cpuFinalReduction,
                    int cpuFinalThreshold,
                    StopWatchInterface *timer,
                    int *h_odata,
                    int *d_idata,
                    int *d_odata) {
    int gpu_result = 0;
    bool needReadBack = true;
    for (int i = 0; i < testIterations; ++i) {
        gpu_result = 0;
        cudaDeviceSynchronize();
        sdkStartTimer(&timer);
        // execute kernel
        reduce(n, numThreads, numBlocks, kernel, d_idata, d_odata);
        // check if kernel execution generated an error
        getLastCudaError("ERROR: kernel execution failed");
        if (cpuFinalReduction) {
            // sum partial sums from each block on CPU
            // copy result from device to host
            checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(int), cudaMemcpyDeviceToHost));
            for (int j = 0; j < numBlocks; j++)
                gpu_result += h_odata[j];
            needReadBack = false;
        }
        else {
            // sum partial block sums on GPU
            int s = numBlocks;
            while (s > cpuFinalThreshold) {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(s, maxBlocks, maxThreads, blocks, threads);
                reduce(s, threads, blocks, kernel, d_odata, d_odata);
                s = (s + (threads * 2 - 1)) / (threads * 2);
            }
            if (s > 1) {
                // copy result from device to host
                checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(int), cudaMemcpyDeviceToHost));
                for (int j = 0; j < s; j++)
                    gpu_result += h_odata[j];
                needReadBack = false;
            }
        }
        cudaDeviceSynchronize();
        sdkStopTimer(&timer);
    }
    if (needReadBack)
        checkCudaErrors(cudaMemcpy(&gpu_result, d_odata, sizeof(int), cudaMemcpyDeviceToHost));
    return gpu_result;
}

bool runTest(int argc, char **argv) {
    int size = 1 << 24;   // number of elements to reduce
    int maxThreads = 256; // number of threads per block
    int kernel = 0;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;
    if (checkCmdLineFlag(argc, (const char **) argv, "n"))
        size = getCmdLineArgumentInt(argc, (const char **) argv, "n");
    if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
        maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    if (checkCmdLineFlag(argc, (const char **) argv, "kernel"))
        kernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel");
    if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
        maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks");
    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);
    cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");
    if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
    // create random input data on CPU
    unsigned int bytes = size * sizeof(int);
    int *h_idata = (int*) malloc(bytes);
    for (int i = 0; i < size; i++)
        // keep data small so we don't get truncation error in the sum
        h_idata[i] = (int)(rand() & 0xFF);
    int numBlocks = 0;
    int numThreads = 0;
    getNumBlocksAndThreads(size, maxBlocks, maxThreads, numBlocks, numThreads);
    if (numBlocks == 1)
        cpuFinalThreshold = 1;
    // allocate mem for the result on host side
    int *h_odata = (int*) malloc(numBlocks * sizeof(int));
    printf("%d blocks\n\n", numBlocks);
    // allocate device memory and data
    int *d_idata = NULL;
    int *d_odata = NULL;
    checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
    checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks * sizeof(int)));
    // copy data directly to device memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(int), cudaMemcpyHostToDevice));
    // warm-up
    reduce(size, numThreads, numBlocks, kernel, d_idata, d_odata);
    int testIterations = 100;
    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);
    int gpu_result = 0;
    gpu_result = benchmarkReduce(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                 kernel, testIterations, cpuFinalReduction,
                                 cpuFinalThreshold, timer,
                                 h_odata, d_idata, d_odata);
    double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
    printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
           1.0e-9 * ((double)bytes) / reduceTime, reduceTime, size, 1, numThreads);
    // compute reference solution
    int cpu_result = reduceCPU(h_idata, size);
    printf("\nGPU result = %d\n", gpu_result);
    printf("CPU result = %d\n\n", cpu_result);
    // cleanup
    sdkDeleteTimer(&timer);
    free(h_idata);
    free(h_odata);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
    return (gpu_result == cpu_result);
}