- /*
- COMMAND LINE ARGUMENTS
- "--n=<N>": Specify the number of elements to reduce (default 16777216)
- "--threads=<N>": Specify the number of threads per block (default 256)
- "--kernel=<N>": Specify which kernel to run (0-4 or 9, default 0)
- "--maxblocks=<N>": Specify the maximum number of thread blocks to launch (default 64)
- "--cpufinal": Read back the per-block results and do the final sum of the block sums on the CPU (default false)
- "--cputhresh=<N>": The number of block sums below which the final reduction is done on the CPU (default 1)
- */
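- /* example invocation (the binary name depends on how this file is built):
- * ./reduce --n=16777216 --threads=256 --kernel=1 --maxblocks=64
- */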
- #include <stdio.h>
- #include <stdlib.h>
- #include <cuda_runtime.h>
- #include <helper_cuda.h>
- #include <helper_functions.h>
- #include <algorithm>
- #include "redAtom.h"
- /////////////////
- // warp reduce //
- /////////////////
- /* warp-synchronous reduction; the volatile pointer keeps the
- * compiler from caching or reordering the non-atomic
- * shared-memory accesses */
- template <unsigned int blockSize>
- __device__ void warpReduce(volatile int *sdata, int tid) {
- if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
- if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
- if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
- if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
- if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
- if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
- }
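- /* for comparison only (not used by the kernels below): a warp reduction
- * built on the CUDA 9+ __shfl_down_sync intrinsic, which exchanges values
- * through registers and needs neither shared memory nor volatile -- a
- * minimal sketch; after the loop, lane 0 holds the warp's sum */
- __device__ int shflWarpReduce(int val) {
- for (int offset = warpSize / 2; offset > 0; offset >>= 1)
- val += __shfl_down_sync(0xffffffff, val, offset);
- return val;
- }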
- ////////////////////////
- // atomic warp reduce //
- ////////////////////////
- /* warp-synchronous reduction using atomic operations
- * to serialize computation */
- template <unsigned int blockSize>
- __device__ void atomicWarpReduce(int *sdata, int tid) {
- if (blockSize >= 64) atomicAdd(&sdata[tid], sdata[tid + 32]);
- if (blockSize >= 32) atomicAdd(&sdata[tid], sdata[tid + 16]);
- if (blockSize >= 16) atomicAdd(&sdata[tid], sdata[tid + 8]);
- if (blockSize >= 8) atomicAdd(&sdata[tid], sdata[tid + 4]);
- if (blockSize >= 4) atomicAdd(&sdata[tid], sdata[tid + 2]);
- if (blockSize >= 2) atomicAdd(&sdata[tid], sdata[tid + 1]);
- }
- ////////////////////////
- // reduction kernel 0 //
- ////////////////////////
- /* fastest reduction algorithm provided by
- * cuda/samples/6_Advanced/reduction/reduction_kernel.cu */
- template <unsigned int blockSize, bool nIsPow2>
- __global__ void reduce0(int *g_idata, int *g_odata, unsigned int n) {
- extern __shared__ int sdata[];
- // first level of reduction (global -> shared)
- unsigned int tid = threadIdx.x;
- unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
- unsigned int gridSize = blockSize * 2 * gridDim.x;
- int sum = 0;
- // reduce multiple elements per thread
- while (i < n) {
- sum += g_idata[i];
- // check bounds
- if (nIsPow2 || i + blockSize < n)
- sum += g_idata[i + blockSize];
- i += gridSize;
- }
- // local sum -> shared memory
- sdata[tid] = sum;
- __syncthreads();
- // reduce in shared memory
- if (blockSize >= 512) {
- if (tid < 256)
- sdata[tid] = sum = sum + sdata[tid + 256];
- __syncthreads();
- }
- if (blockSize >= 256) {
- if (tid < 128)
- sdata[tid] = sum = sum + sdata[tid + 128];
- __syncthreads();
- }
- if (blockSize >= 128) {
- if (tid < 64)
- sdata[tid] = sum = sum + sdata[tid + 64];
- __syncthreads();
- }
- if (tid < 32) {
- // warp-synchronous reduction
- // volatile memory stores won't be reordered by compiler
- volatile int *smem = sdata;
- if (blockSize >= 64)
- smem[tid] = sum = sum + smem[tid + 32];
- if (blockSize >= 32)
- smem[tid] = sum = sum + smem[tid + 16];
- if (blockSize >= 16)
- smem[tid] = sum = sum + smem[tid + 8];
- if (blockSize >= 8)
- smem[tid] = sum = sum + smem[tid + 4];
- if (blockSize >= 4)
- smem[tid] = sum = sum + smem[tid + 2];
- if (blockSize >= 2)
- smem[tid] = sum = sum + smem[tid + 1];
- }
- // write result for block to global memory
- if (tid == 0)
- g_odata[blockIdx.x] = sdata[0];
- }
- /////////////////////////
- // reduction kernel 1 //
- /////////////////////////
- /* fastest reduction algorithm described in
- * cuda/samples/6_Advanced/reduction/doc/reduction.pdf */
- template <unsigned int blockSize>
- __global__ void reduce1(int *g_idata, int *g_odata, unsigned int n) {
- extern __shared__ int sdata[];
- // first level of reduction (global -> shared)
- unsigned int tid = threadIdx.x;
- unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
- unsigned int gridSize = blockSize * 2 * gridDim.x;
- sdata[tid] = 0;
- // reduce multiple elements per thread
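- // note: unlike reduce0, there is no bounds check on i + blockSize, so n is
- // assumed large enough (a power of two in this benchmark); reduce2-4 inherit
- // the same assumption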
- while (i < n) {
- sdata[tid] += g_idata[i] + g_idata[i+blockSize];
- i += gridSize;
- }
- __syncthreads();
- // reduce in shared memory
- if (blockSize >= 512) {
- if (tid < 256)
- sdata[tid] += sdata[tid + 256];
- __syncthreads();
- }
- if (blockSize >= 256) {
- if (tid < 128)
- sdata[tid] += sdata[tid + 128];
- __syncthreads();
- }
- if (blockSize >= 128) {
- if (tid < 64)
- sdata[tid] += sdata[tid + 64];
- __syncthreads();
- }
- if (tid < 32) warpReduce<blockSize>(sdata, tid);
- // write result for block to global memory
- if (tid == 0)
- g_odata[blockIdx.x] = sdata[0];
- }
- /////////////////////////
- // reduction kernel 2 //
- /////////////////////////
- /* reduction kernel 1 executed
- * with atomic warp-synchronous addition */
- template <unsigned int blockSize>
- __global__ void reduce2(int *g_idata, int *g_odata, unsigned int n) {
- extern __shared__ int sdata[];
- // first level of reduction (global -> shared)
- unsigned int tid = threadIdx.x;
- unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
- unsigned int gridSize = blockSize * 2 * gridDim.x;
- sdata[tid] = 0;
- // reduce multiple elements per thread
- while (i < n) {
- sdata[tid] += g_idata[i] + g_idata[i+blockSize];
- i += gridSize;
- }
- __syncthreads();
- // reduce in shared memory
- if (blockSize >= 512) {
- if (tid < 256)
- sdata[tid] += sdata[tid + 256];
- __syncthreads();
- }
- if (blockSize >= 256) {
- if (tid < 128)
- sdata[tid] += sdata[tid + 128];
- __syncthreads();
- }
- if (blockSize >= 128) {
- if (tid < 64)
- sdata[tid] += sdata[tid + 64];
- __syncthreads();
- }
- if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
- // write result for block to global memory
- if (tid == 0)
- g_odata[blockIdx.x] = sdata[0];
- }
- /////////////////////////
- // reduction kernel 3 //
- /////////////////////////
- template <unsigned int blockSize>
- __global__ void reduce3(int *g_idata, int *g_odata, unsigned int n) {
- extern __shared__ int sdata[];
- // first level of reduction (global -> shared)
- unsigned int tid = threadIdx.x;
- unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
- unsigned int gridSize = blockSize * 2 * gridDim.x;
- sdata[tid] = 0;
- // reduce multiple elements per thread
- while (i < n) {
- sdata[tid] += g_idata[i] + g_idata[i+blockSize];
- i += gridSize;
- }
- __syncthreads();
- // reduce in shared memory
- if (blockSize >= 512) {
- if (tid < 256)
- atomicAdd(&sdata[tid], sdata[tid + 256]);
- __syncthreads();
- }
- if (blockSize >= 256) {
- if (tid < 128)
- atomicAdd(&sdata[tid], sdata[tid + 128]);
- __syncthreads();
- }
- if (blockSize >= 128) {
- if (tid < 64)
- atomicAdd(&sdata[tid], sdata[tid + 64]);
- __syncthreads();
- }
- if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
- // write result for block to global memory
- if (tid == 0)
- g_odata[blockIdx.x] = sdata[0];
- }
- /////////////////////////
- // reduction kernel 4 //
- /////////////////////////
- template <unsigned int blockSize>
- __global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
- extern __shared__ int sdata[];
- // first level of reduction (global -> shared)
- unsigned int tid = threadIdx.x;
- unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
- unsigned int gridSize = blockSize * 2 * gridDim.x;
- sdata[tid] = 0;
- // reduce multiple elements per thread
- while (i < n) {
- atomicAdd(&sdata[tid], (g_idata[i] + g_idata[i+blockSize]));
- i += gridSize;
- }
- __syncthreads();
- // reduce in shared memory
- if (blockSize >= 512) {
- if (tid < 256)
- atomicAdd(&sdata[tid], sdata[tid + 256]);
- __syncthreads();
- }
- if (blockSize >= 256) {
- if (tid < 128)
- atomicAdd(&sdata[tid], sdata[tid + 128]);
- __syncthreads();
- }
- if (blockSize >= 128) {
- if (tid < 64)
- atomicAdd(&sdata[tid], sdata[tid + 64]);
- __syncthreads();
- }
- if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
- // write result for block to global memory
- if (tid == 0)
- g_odata[blockIdx.x] = sdata[0];
- }
- ////////////////////////
- // reduction kernel 9 //
- ////////////////////////
- /* this reduction kernel interleaves which threads are active by using the modulo
- * operator; modulo is very expensive on GPUs, and the interleaved inactivity
- * means that no warp is ever fully active, which is also very inefficient */
- __global__ void
- reduce9(int *g_idata, int *g_odata, unsigned int n) {
- extern __shared__ int sdata[];
- // load shared memory
- unsigned int tid = threadIdx.x;
- unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
- sdata[tid] = (i < n) ? g_idata[i] : 0;
- __syncthreads();
- // do reduction in shared memory
- for (unsigned int s=1; s < blockDim.x; s *= 2) {
- // modulo arithmetic is slow
- if ((tid % (2*s)) == 0)
- sdata[tid] += sdata[tid + s];
- __syncthreads();
- }
- // write result for this block to global mem
- if (tid == 0) g_odata[blockIdx.x] = sdata[0];
- }
- extern "C" bool isPow2(unsigned int x);
- /////////////////////////////
- // kernel wrapper function //
- /////////////////////////////
- /* computes shared memory size, selects corresponding templates
- * for each reduction kernel and launches them */
- void reduce(int size, int threads, int blocks,
- int kernel, int *d_idata, int *d_odata) {
- dim3 dimBlock(threads, 1, 1);
- dim3 dimGrid(blocks, 1 ,1);
- // if there is only one warp per block, allocate two warps' worth
- // of shared memory to avoid indexing shared memory out of bounds
- int smemSize = (threads <= 32) ? 2 * threads * sizeof(int) : threads * sizeof(int);
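- // e.g. with threads = 32 the warp reduction still reads sdata[tid + 16] for tid up to 31,
- // i.e. index 47, so 64 ints must be allocated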
- // select and launch reduction kernel
- switch (kernel) {
- case 0:
- if (isPow2(size)) {
- switch (threads) {
- case 512: reduce0<512, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 256: reduce0<256, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 128: reduce0<128, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 64: reduce0< 64, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 32: reduce0< 32, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 16: reduce0< 16, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 8: reduce0< 8, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 4: reduce0< 4, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 2: reduce0< 2, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 1: reduce0< 1, true><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- }
- }
- else {
- switch (threads) {
- case 512: reduce0<512, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 256: reduce0<256, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 128: reduce0<128, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 64: reduce0< 64, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 32: reduce0< 32, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 16: reduce0< 16, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 8: reduce0< 8, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 4: reduce0< 4, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 2: reduce0< 2, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- case 1: reduce0< 1, false><<<dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
- }
- }
- break;
- case 1:
- switch (threads) {
- case 512: reduce1<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 256: reduce1<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 128: reduce1<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 64: reduce1< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 32: reduce1< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 16: reduce1< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 8: reduce1< 8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 4: reduce1< 4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 2: reduce1< 2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 1: reduce1< 1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- }
- break;
- case 2:
- switch (threads) {
- case 512: reduce2<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 256: reduce2<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 128: reduce2<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 64: reduce2< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 32: reduce2< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 16: reduce2< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 8: reduce2< 8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 4: reduce2< 4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 2: reduce2< 2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 1: reduce2< 1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- }
- break;
- case 3:
- switch (threads) {
- case 512: reduce3<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 256: reduce3<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 128: reduce3<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 64: reduce3< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 32: reduce3< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 16: reduce3< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 8: reduce3< 8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 4: reduce3< 4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 2: reduce3< 2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 1: reduce3< 1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- }
- break;
- case 4:
- switch (threads) {
- case 512: reduce4<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 256: reduce4<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 128: reduce4<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 64: reduce4< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 32: reduce4< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 16: reduce4< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 8: reduce4< 8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 4: reduce4< 4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 2: reduce4< 2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- case 1: reduce4< 1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- }
- break;
- case 9: reduce9<<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
- }
- }
- //////////////////////////
- // forward declarations //
- //////////////////////////
- bool runTest(int argc, char **argv);
- #define MAX_BLOCK_DIM_SIZE 65535
- #ifdef WIN32
- #define strcasecmp strcmpi
- #endif
- extern "C" bool isPow2(unsigned int x) {
- return ((x&(x-1))==0);
- }
- //////////////////
- // program main //
- //////////////////
- int main(int argc, char **argv) {
- printf("%s Starting...\n\n", argv[0]);
- cudaDeviceProp deviceProp;
- deviceProp.major = 1;
- deviceProp.minor = 0;
- int minimumComputeVersion = 10;
- int dev;
- dev = findCudaDevice(argc, (const char **)argv);
- checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
- if ((deviceProp.major * 10 + deviceProp.minor) >= minimumComputeVersion) {
- printf("Using Device %d: %s\n\n", dev, deviceProp.name);
- checkCudaErrors(cudaSetDevice(dev));
- }
- else {
- printf("Error: the selected device does not support the minimum compute capability of %d.%d.\n\n",
- minimumComputeVersion / 10, minimumComputeVersion % 10);
- cudaDeviceReset();
- exit(EXIT_FAILURE);
- }
- printf("Reducing Integer Array...\n\n");
- bool bResult = false;
- bResult = runTest(argc, argv);
- cudaDeviceReset();
- printf(bResult ? "Test passed\n" : "Test failed!\n");
- return bResult ? EXIT_SUCCESS : EXIT_FAILURE;
- }
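- // CPU reference reduction over the same input array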
- int reduceCPU(int *data, int size) {
- // compensated (Kahan) summation kept from the float version of this sample;
- // with int arithmetic the compensation term stays zero (absent overflow)
- int sum = data[0];
- int c = 0;
- for (int i = 1; i < size; i++) {
- int y = data[i] - c;
- int t = sum + y;
- c = (t - sum) - y;
- sum = t;
- }
- return sum;
- }
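- // round x up to the next power of two (x is returned unchanged if it is already one)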
- unsigned int nextPow2(unsigned int x) {
- --x;
- x |= x >> 1;
- x |= x >> 2;
- x |= x >> 4;
- x |= x >> 8;
- x |= x >> 16;
- return ++x;
- }
- ///////////////
- // functions //
- ///////////////
- #ifndef MIN
- #define MIN(x,y) ((x < y) ? x : y)
- #endif
- void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads,
- int &blocks, int &threads) {
- // get device capability
- cudaDeviceProp prop;
- int device;
- checkCudaErrors(cudaGetDevice(&device));
- checkCudaErrors(cudaGetDeviceProperties(&prop, device));
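- // each thread sums two input elements on the first pass, so target roughly
- // n / (2 * threads) blocks, capped by maxBlocks below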
- threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
- blocks = (n + (threads * 2 - 1)) / (threads * 2);
- // device capability check
- if ((float)threads * blocks > (float)prop.maxGridSize[0] * prop.maxThreadsPerBlock)
- printf("n is too large -- device capability exceeded\n");
- if (blocks > prop.maxGridSize[0]) {
- printf("grid size <%d> exceeds device capability <%d> -- set block size as %d (original %d)\n",
- blocks, prop.maxGridSize[0], threads * 2, threads);
- blocks /= 2;
- threads *= 2;
- }
- blocks = MIN(maxBlocks, blocks);
- }
- int benchmarkReduce(int n,
- int numThreads,
- int numBlocks,
- int maxThreads,
- int maxBlocks,
- int kernel,
- int testIterations,
- bool cpuFinalReduction,
- int cpuFinalThreshold,
- StopWatchInterface *timer,
- int *h_odata,
- int *d_idata,
- int *d_odata) {
- int gpu_result = 0;
- bool needReadBack = true;
- for (int i = 0; i < testIterations; ++i) {
- gpu_result = 0;
- cudaDeviceSynchronize();
- sdkStartTimer(&timer);
- // execute kernel
- reduce(n, numThreads, numBlocks, kernel, d_idata, d_odata);
- // check if kernel execution generated an error
- getLastCudaError("ERROR: kernel excution failed");
- if (cpuFinalReduction) {
- // sum partial sums from each block on CPU
- // copy result from device to host
- checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(int), cudaMemcpyDeviceToHost));
- for (int i = 0; i < numBlocks; i++)
- gpu_result += h_odata[i];
- needReadBack = false;
- }
- else {
- // sum partial block sums on GPU
- int s = numBlocks;
- while (s > cpuFinalThreshold) {
- int threads = 0, blocks = 0;
- getNumBlocksAndThreads(s, maxBlocks, maxThreads, blocks, threads);
- reduce(s, threads, blocks, kernel, d_odata, d_odata);
- s = (s + (threads * 2 - 1)) / (threads * 2);
- }
- if (s > 1) {
- // copy result from device to host
- checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(int), cudaMemcpyDeviceToHost));
- for (int i = 0; i < s; i++)
- gpu_result += h_odata[i];
- needReadBack = false;
- }
- }
- cudaDeviceSynchronize();
- sdkStopTimer(&timer);
- }
- if (needReadBack)
- checkCudaErrors(cudaMemcpy(&gpu_result, d_odata, sizeof(int), cudaMemcpyDeviceToHost));
- return gpu_result;
- }
- bool runTest(int argc, char **argv) {
- int size = 1<<24; // number of elements to reduce
- int maxThreads = 256; // number of threads per block
- int kernel = 0;
- int maxBlocks = 64;
- bool cpuFinalReduction = false;
- int cpuFinalThreshold = 1;
- if (checkCmdLineFlag(argc, (const char **) argv, "n"))
- size = getCmdLineArgumentInt(argc, (const char **) argv, "n");
- if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
- maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
- if (checkCmdLineFlag(argc, (const char **) argv, "kernel"))
- kernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel");
- if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
- maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks");
- printf("%d elements\n", size);
- printf("%d threads (max)\n", maxThreads);
- cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");
- if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
- cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
- // create random input data on CPU
- unsigned int bytes = size * sizeof(int);
- int *h_idata = (int*) malloc(bytes);
- for (int i = 0; i < size; i++)
- // keep data small so the 32-bit integer sum does not overflow
- h_idata[i] = (int)(rand() & 0xFF);
- int numBlocks = 0;
- int numThreads = 0;
- getNumBlocksAndThreads(size, maxBlocks, maxThreads, numBlocks, numThreads);
- if (numBlocks == 1)
- cpuFinalThreshold = 1;
- // allocate mem for the result on host side
- int *h_odata = (int*) malloc(numBlocks*sizeof(int));
- printf("%d blocks\n\n", numBlocks);
- // allocate device memory and data
- int *d_idata = NULL;
- int *d_odata = NULL;
- checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
- checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(int)));
- // copy data directly to device memory
- checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
- checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(int), cudaMemcpyHostToDevice));
- // warm-up
- reduce(size, numThreads, numBlocks, kernel, d_idata, d_odata);
- int testIterations = 100;
- StopWatchInterface *timer = 0;
- sdkCreateTimer(&timer);
- int gpu_result = 0;
- gpu_result = benchmarkReduce(size, numThreads, numBlocks, maxThreads, maxBlocks,
- kernel, testIterations, cpuFinalReduction,
- cpuFinalThreshold, timer,
- h_odata, d_idata, d_odata);
- double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
- printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
- 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
- // compute reference solution
- int cpu_result = reduceCPU(h_idata, size);
- printf("\nGPU result = %d\n", gpu_result);
- printf("CPU result = %d\n\n", cpu_result);
- // cleanup
- sdkDeleteTimer(&timer);
- free(h_idata);
- free(h_odata);
- checkCudaErrors(cudaFree(d_idata));
- checkCudaErrors(cudaFree(d_odata));
- return (gpu_result == cpu_result);
- }