/*
COMMAND LINE ARGUMENTS

"--n=<N>": Specify the number of elements to reduce (default 16777216)
"--threads=<N>": Specify the maximum number of threads per block (default 256)
"--kernel=<N>": Specify which kernel to run (0-4 or 9, default 0)
"--maxblocks=<N>": Specify the maximum number of thread blocks to launch (default 64)
"--cpufinal": Read back the per-block results and do the final sum of block sums on the CPU (default false)
"--cputhresh=<N>": The number of remaining block sums at or below which the final reduction is done on the CPU (default 1)
*/

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <algorithm>
#include "redAtom.h"

/////////////////
// warp reduce //
/////////////////
/* warp-synchronous reduction using volatile memory
 * to prevent instruction reordering for non-atomic
 * operations */

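/* note: this implicit warp-synchronous pattern relies on the 32 threads of a
 * warp executing in lockstep; the volatile qualifier stops the compiler from
 * caching shared-memory values in registers. On Volta and later architectures
 * (independent thread scheduling) this pattern is no longer safe without
 * __syncwarp() or warp shuffle intrinsics */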
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, int tid) {
    if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
    if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
    if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
    if (blockSize >=  8) sdata[tid] += sdata[tid + 4];
    if (blockSize >=  4) sdata[tid] += sdata[tid + 2];
    if (blockSize >=  2) sdata[tid] += sdata[tid + 1];
}

////////////////////////
// atomic warp reduce //
////////////////////////
/* warp-synchronous reduction using atomic operations
 * to serialize computation */

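/* note: atomicAdd() on shared memory (compute capability 1.2+) serializes the
 * read-modify-write on sdata[tid], but the second operand sdata[tid + offset]
 * is still an ordinary load, so this variant also relies on warp-synchronous
 * execution */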
template <unsigned int blockSize>
__device__ void atomicWarpReduce(int *sdata, int tid) {
    if (blockSize >= 64) atomicAdd(&sdata[tid], sdata[tid + 32]);
    if (blockSize >= 32) atomicAdd(&sdata[tid], sdata[tid + 16]);
    if (blockSize >= 16) atomicAdd(&sdata[tid], sdata[tid + 8]);
    if (blockSize >=  8) atomicAdd(&sdata[tid], sdata[tid + 4]);
    if (blockSize >=  4) atomicAdd(&sdata[tid], sdata[tid + 2]);
    if (blockSize >=  2) atomicAdd(&sdata[tid], sdata[tid + 1]);
}

////////////////////////
// reduction kernel 0 //
////////////////////////
/* fastest reduction algorithm provided by
 * cuda/samples/6_Advanced/reduction/reduction_kernel.cu */

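/* the nIsPow2 template flag is resolved at compile time, so the power-of-two
 * specialization can skip the bounds check on the second load; each thread
 * strides through the input (grid-stride loop) and accumulates in a register
 * before the shared-memory tree reduction */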
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    int sum = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sum += g_idata[i];
        // check bounds
        if (nIsPow2 || i + blockSize < n)
            sum += g_idata[i + blockSize];
        i += gridSize;
    }
    // local sum -> shared memory
    sdata[tid] = sum;
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] = sum = sum + sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] = sum = sum + sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] = sum = sum + sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32) {
        // warp-synchronous reduction
        // volatile memory stores won't be reordered by the compiler
        volatile int *smem = sdata;
        if (blockSize >= 64)
            smem[tid] = sum = sum + smem[tid + 32];
        if (blockSize >= 32)
            smem[tid] = sum = sum + smem[tid + 16];
        if (blockSize >= 16)
            smem[tid] = sum = sum + smem[tid + 8];
        if (blockSize >= 8)
            smem[tid] = sum = sum + smem[tid + 4];
        if (blockSize >= 4)
            smem[tid] = sum = sum + smem[tid + 2];
        if (blockSize >= 2)
            smem[tid] = sum = sum + smem[tid + 1];
    }
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 1 //
////////////////////////
/* fastest reduction algorithm described in
 * cuda/samples/6_Advanced/reduction/doc/reduction.pdf */

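/* unlike kernel 0, the per-thread partial sum is accumulated directly
 * in shared memory rather than in a register */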
template <unsigned int blockSize>
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i];
        // guard the second load in case n is not a multiple of the stride
        if (i + blockSize < n)
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 2 //
////////////////////////
/* reduction kernel 1 executed
 * with atomic warp-synchronous addition */

template <unsigned int blockSize>
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i];
        // guard the second load in case n is not a multiple of the stride
        if (i + blockSize < n)
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 3 //
////////////////////////
/* reduction kernel 2 with atomic additions
 * at every level of the shared-memory reduction */

template <unsigned int blockSize>
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i];
        // guard the second load in case n is not a multiple of the stride
        if (i + blockSize < n)
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            atomicAdd(&sdata[tid], sdata[tid + 256]);
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            atomicAdd(&sdata[tid], sdata[tid + 128]);
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            atomicAdd(&sdata[tid], sdata[tid + 64]);
        __syncthreads();
    }
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 4 //
////////////////////////
/* reduction kernel 3 with the first-level (global -> shared)
 * accumulation also performed with atomicAdd */

template <unsigned int blockSize>
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        int x = g_idata[i];
        // guard the second load in case n is not a multiple of the stride
        if (i + blockSize < n)
            x += g_idata[i + blockSize];
        atomicAdd(&sdata[tid], x);
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            atomicAdd(&sdata[tid], sdata[tid + 256]);
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            atomicAdd(&sdata[tid], sdata[tid + 128]);
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            atomicAdd(&sdata[tid], sdata[tid + 64]);
        __syncthreads();
    }
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

////////////////////////
// reduction kernel 9 //
////////////////////////
/* this reduction kernel interleaves which threads are active by using the
 * modulo operator; modulo is very expensive on GPUs, and the interleaved
 * inactivity means that no whole warps are active, which is also very
 * inefficient */
__global__ void
reduce9(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // load shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    __syncthreads();
    // do reduction in shared memory
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        // modulo arithmetic is slow
        if ((tid % (2 * s)) == 0)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

extern "C" bool isPow2(unsigned int x);

/////////////////////////////
// kernel wrapper function //
/////////////////////////////
/* computes shared memory size, selects the corresponding template
 * for each reduction kernel and launches it */

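/* note: the template switch below only instantiates power-of-two block sizes
 * up to 512, so `threads` must be one of those values; getNumBlocksAndThreads()
 * only guarantees this when maxThreads itself is a power of two no larger than
 * 512. The third launch parameter is the dynamic shared-memory size in bytes. */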
void reduce(int size, int threads, int blocks,
            int kernel, int *d_idata, int *d_odata) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);
    // if there is only one warp per block allocate two warps' worth
    // of shared memory to avoid indexing shared memory out of bounds
    int smemSize = (threads <= 32) ? 2 * threads * sizeof(int) : threads * sizeof(int);
    // select and launch reduction kernel
    switch (kernel) {
    case 0:
        if (isPow2(size)) {
            switch (threads) {
            case 512: reduce0<512, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case 256: reduce0<256, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case 128: reduce0<128, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  64: reduce0< 64, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  32: reduce0< 32, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  16: reduce0< 16, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   8: reduce0<  8, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   4: reduce0<  4, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   2: reduce0<  2, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   1: reduce0<  1, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            }
        }
        else {
            // non-power-of-two input: instantiate with nIsPow2 = false so the
            // kernel bounds-checks the second load
            switch (threads) {
            case 512: reduce0<512, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case 256: reduce0<256, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case 128: reduce0<128, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  64: reduce0< 64, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  32: reduce0< 32, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case  16: reduce0< 16, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   8: reduce0<  8, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   4: reduce0<  4, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   2: reduce0<  2, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            case   1: reduce0<  1, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
            }
        }
        break;
    case 1:
        switch (threads) {
        case 512: reduce1<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 256: reduce1<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 128: reduce1<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  64: reduce1< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  32: reduce1< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  16: reduce1< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   8: reduce1<  8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   4: reduce1<  4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   2: reduce1<  2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   1: reduce1<  1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        }
        break;
    case 2:
        switch (threads) {
        case 512: reduce2<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 256: reduce2<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 128: reduce2<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  64: reduce2< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  32: reduce2< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  16: reduce2< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   8: reduce2<  8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   4: reduce2<  4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   2: reduce2<  2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   1: reduce2<  1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        }
        break;
    case 3:
        switch (threads) {
        case 512: reduce3<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 256: reduce3<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 128: reduce3<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  64: reduce3< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  32: reduce3< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  16: reduce3< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   8: reduce3<  8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   4: reduce3<  4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   2: reduce3<  2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   1: reduce3<  1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        }
        break;
    case 4:
        switch (threads) {
        case 512: reduce4<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 256: reduce4<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case 128: reduce4<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  64: reduce4< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  32: reduce4< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case  16: reduce4< 16><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   8: reduce4<  8><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   4: reduce4<  4><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   2: reduce4<  2><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        case   1: reduce4<  1><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
        }
        break;
    case 9: reduce9<<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); break;
    }
}

//////////////////////////
// forward declarations //
//////////////////////////

bool runTest(int argc, char **argv);
#define MAX_BLOCK_DIM_SIZE 65535
#ifdef WIN32
#define strcasecmp strcmpi
#endif
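// a power of two has exactly one bit set, so x & (x-1) clears it and yields
// zero (note that 0 also passes this test)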
extern "C" bool isPow2(unsigned int x) {
    return ((x & (x - 1)) == 0);
}

//////////////////
// program main //
//////////////////

int main(int argc, char **argv) {
    printf("%s Starting...\n\n", argv[0]);
    cudaDeviceProp deviceProp;
    deviceProp.major = 1;
    deviceProp.minor = 0;
    // shared-memory atomicAdd (kernels 2-4) requires compute capability 1.2
    int minimumComputeVersion = 12;
    int dev = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
    if ((deviceProp.major * 10 + deviceProp.minor) >= minimumComputeVersion) {
        printf("Using Device %d: %s\n\n", dev, deviceProp.name);
        checkCudaErrors(cudaSetDevice(dev));
    }
    else {
        printf("Error: the selected device does not support the minimum compute capability of %d.%d.\n\n",
               minimumComputeVersion / 10, minimumComputeVersion % 10);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
    printf("Reducing Integer Array...\n\n");
    bool bResult = runTest(argc, argv);
    cudaDeviceReset();
    printf(bResult ? "Test passed\n" : "Test failed!\n");
    return bResult ? EXIT_SUCCESS : EXIT_FAILURE;
}

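/* CPU reference reduction using Kahan (compensated) summation, kept from the
 * floating-point version of the sample; with integer data the compensation
 * term c is always zero (barring overflow), so this is effectively a plain sum */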
int reduceCPU(int *data, int size) {
    int sum = data[0];
    int c = 0;
    for (int i = 1; i < size; i++) {
        int y = data[i] - c;
        int t = sum + y;
        c = (t - sum) - y;
        sum = t;
    }
    return sum;
}
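/* rounds up to the next power of two by smearing the highest set bit into all
 * lower bit positions and then adding one; the initial decrement keeps exact
 * powers of two unchanged */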
unsigned int nextPow2(unsigned int x) {
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

///////////////
// functions //
///////////////

#ifndef MIN
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
#endif

void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads,
                            int &blocks, int &threads) {
    // get device capability
    cudaDeviceProp prop;
    int device;
    checkCudaErrors(cudaGetDevice(&device));
    checkCudaErrors(cudaGetDeviceProperties(&prop, device));
    threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
    blocks = (n + (threads * 2 - 1)) / (threads * 2);
    // device capability check (64-bit arithmetic to avoid integer overflow)
    if ((long long)threads * blocks > (long long)prop.maxGridSize[0] * prop.maxThreadsPerBlock)
        printf("n is too large -- device capability exceeded\n");
    if (blocks > prop.maxGridSize[0]) {
        printf("grid size <%d> exceeds device capability <%d> -- set block size as %d (original %d)\n",
               blocks, prop.maxGridSize[0], threads * 2, threads);
        blocks /= 2;
        threads *= 2;
    }
    blocks = MIN(maxBlocks, blocks);
}

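/* runs the selected kernel testIterations times; each launch leaves one partial
 * sum per block in d_odata, and the partial sums are then combined either on
 * the CPU (--cpufinal) or by re-launching the kernel on d_odata until at most
 * cpuFinalThreshold values remain */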
int benchmarkReduce(int n,
                    int numThreads,
                    int numBlocks,
                    int maxThreads,
                    int maxBlocks,
                    int kernel,
                    int testIterations,
                    bool cpuFinalReduction,
                    int cpuFinalThreshold,
                    StopWatchInterface *timer,
                    int *h_odata,
                    int *d_idata,
                    int *d_odata) {
    int gpu_result = 0;
    bool needReadBack = true;
    for (int i = 0; i < testIterations; ++i) {
        gpu_result = 0;
        cudaDeviceSynchronize();
        sdkStartTimer(&timer);
        // execute kernel
        reduce(n, numThreads, numBlocks, kernel, d_idata, d_odata);
        // check if kernel execution generated an error
        getLastCudaError("ERROR: kernel execution failed");
        if (cpuFinalReduction) {
            // sum partial sums from each block on CPU
            // copy result from device to host
            checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(int), cudaMemcpyDeviceToHost));
            for (int j = 0; j < numBlocks; j++)
                gpu_result += h_odata[j];
            needReadBack = false;
        }
        else {
            // sum partial block sums on GPU
            int s = numBlocks;
            while (s > cpuFinalThreshold) {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(s, maxBlocks, maxThreads, blocks, threads);
                reduce(s, threads, blocks, kernel, d_odata, d_odata);
                s = (s + (threads * 2 - 1)) / (threads * 2);
            }
            if (s > 1) {
                // copy result from device to host
                checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(int), cudaMemcpyDeviceToHost));
                for (int j = 0; j < s; j++)
                    gpu_result += h_odata[j];
                needReadBack = false;
            }
        }
        cudaDeviceSynchronize();
        sdkStopTimer(&timer);
    }
    if (needReadBack)
        checkCudaErrors(cudaMemcpy(&gpu_result, d_odata, sizeof(int), cudaMemcpyDeviceToHost));
    return gpu_result;
}

bool runTest(int argc, char **argv) {
    int size = 1 << 24;   // number of elements to reduce
    int maxThreads = 256; // number of threads per block
    int kernel = 0;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;
    if (checkCmdLineFlag(argc, (const char **) argv, "n"))
        size = getCmdLineArgumentInt(argc, (const char **) argv, "n");
    if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
        maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    if (checkCmdLineFlag(argc, (const char **) argv, "kernel"))
        kernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel");
    if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
        maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks");
    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);
    cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");
    if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
    // create random input data on CPU
    unsigned int bytes = size * sizeof(int);
    int *h_idata = (int*) malloc(bytes);
    for (int i = 0; i < size; i++)
        // keep data small so we don't get truncation error in the sum
        h_idata[i] = (int)(rand() & 0xFF);
    int numBlocks = 0;
    int numThreads = 0;
    getNumBlocksAndThreads(size, maxBlocks, maxThreads, numBlocks, numThreads);
    if (numBlocks == 1)
        cpuFinalThreshold = 1;
    // allocate mem for the result on host side
    int *h_odata = (int*) malloc(numBlocks * sizeof(int));
    printf("%d blocks\n\n", numBlocks);
    // allocate device memory and data
    int *d_idata = NULL;
    int *d_odata = NULL;
    checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
    checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks * sizeof(int)));
    // copy data directly to device memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(int), cudaMemcpyHostToDevice));
    // warm-up
    reduce(size, numThreads, numBlocks, kernel, d_idata, d_odata);
    int testIterations = 100;
    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);
    int gpu_result = 0;
    gpu_result = benchmarkReduce(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                 kernel, testIterations, cpuFinalReduction,
                                 cpuFinalThreshold, timer,
                                 h_odata, d_idata, d_odata);
    double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
    printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
           1.0e-9 * ((double)bytes) / reduceTime, reduceTime, size, 1, numThreads);
    // compute reference solution
    int cpu_result = reduceCPU(h_idata, size);
    printf("\nGPU result = %d\n", gpu_result);
    printf("CPU result = %d\n\n", cpu_result);
    // cleanup
    sdkDeleteTimer(&timer);
    free(h_idata);
    free(h_odata);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
    return (gpu_result == cpu_result);
}