Advertisement
Guest User

Untitled

a guest
Apr 21st, 2019
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.26 KB | None | 0 0
  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3.  
  4. #include <stdio.h>
  5. #include <numeric>
  6. #include <stdlib.h>
  7. #include <cuda.h>
  8.  
  9. /* -------- KERNEL -------- */
  /**
   * Sums each thread block's slice of d_in with a tree reduction carried out
   * in place in global memory, then writes one partial sum per block.
   *
   * Launch layout: 1-D grid of 1-D blocks. Block b owns the elements
   * [b * blockDim.x, (b + 1) * blockDim.x) of d_in. No shared memory is used.
   * The block size does not have to be a power of two.
   *
   * @param d_out  receives the partial sum of block b at d_out[b]; a block
   *               whose first element lies at or beyond size writes nothing
   * @param d_in   input values; OVERWRITTEN with intermediate sums
   * @param size   number of valid elements in d_in
   */
  __global__ void reduce_kernel(int * d_out, int * d_in, int size)
  {
      const unsigned int tid = threadIdx.x;
      // Widen before multiplying so the global index cannot wrap on large grids.
      const size_t pos = (size_t)blockIdx.x * blockDim.x + tid;
      const size_t count = (size > 0) ? (size_t)size : 0;

      // "active" is the number of leading slots of this block that still hold
      // live partial sums. Each step folds the upper part [half, active) onto
      // the lower part; rounding half up keeps the middle element of an
      // odd-sized range alive instead of dropping it.
      for (unsigned int active = blockDim.x; active > 1; )
      {
          const unsigned int half = (active + 1) / 2;
          if (tid < active - half && pos + half < count) // second test: array tail
          {
              d_in[pos] += d_in[pos + half];
          }
          // Every thread reaches the barrier: the loop bounds depend only on
          // blockDim.x, and the barrier sits outside the divergent branch.
          __syncthreads();
          active = half;
      }

      // Thread 0's slot now holds the block total.
      if (tid == 0 && pos < count)
      {
          d_out[blockIdx.x] = d_in[pos];
      }
  }
  35.  
  36. /* -------- KERNEL WRAPPER -------- */
  /** Prints a diagnostic and terminates the process if a CUDA call made by reduce() failed. */
  static void reduce_check(cudaError_t status, const char * what)
  {
      if (status != cudaSuccess)
      {
          fprintf(stderr, "reduce: %s failed: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
      }
  }

  /** Returns the largest power of two that is <= value (value must be >= 1). */
  static int reduce_floor_pow2(int value)
  {
      int power = 1;
      while (power <= value / 2)
      {
          power *= 2;
      }
      return power;
  }

  /**
   * Sums size ints stored on the device and stores the total in d_out[0].
   *
   * Runs reduce_kernel repeatedly: each round turns every block's slice into
   * one partial sum, and the partial sums become the input of the next round
   * until a single block is enough.
   *
   * @param d_out        device pointer to at least one int; receives the sum
   *                     (0 when size <= 0)
   * @param d_in         device pointer to size ints; its contents are
   *                     overwritten by the in-place reduction
   * @param size         number of elements to sum
   * @param num_threads  requested threads per block (must be >= 2); rounded
   *                     down to a power of two because the tree reduction
   *                     halves its stride on every step
   *
   * Prints the launch configuration of every round to stdout. Terminates the
   * process with a message on stderr if a CUDA call fails or num_threads < 2.
   */
  void reduce(int * d_out, int * d_in, int size, int num_threads)
  {
      if (num_threads < 2)
      {
          // A one-thread block cannot shrink the problem, so the rounds would never end.
          fprintf(stderr, "reduce: num_threads must be at least 2 (got %d)\n", num_threads);
          exit(EXIT_FAILURE);
      }
      if (size <= 0)
      {
          // The sum of nothing is zero; an empty grid would be an invalid launch.
          reduce_check(cudaMemset(d_out, 0, sizeof(int)), "cudaMemset");
          return;
      }

      const int threads = reduce_floor_pow2(num_threads);

      // Partial-sum counts of the first two rounds. Later rounds only shrink,
      // so these two sizes are enough for a pair of ping-pong buffers.
      const int first_count = size / threads + (size % threads != 0);
      const int second_count = first_count / threads + (first_count % threads != 0);

      // One allocation for the whole call instead of a malloc/free per round.
      int * d_scratch = NULL;
      int * d_buffers[2] = { NULL, NULL };
      if (first_count > 1)
      {
          const size_t scratch_ints = (size_t)first_count + (size_t)second_count;
          reduce_check(cudaMalloc(&d_scratch, sizeof(int) * scratch_ints), "cudaMalloc");
          d_buffers[0] = d_scratch;
          d_buffers[1] = d_scratch + first_count;
      }

      int * d_src = d_in;
      int remaining = size;
      for (int round = 1; ; ++round)
      {
          // Ceil-divide without risking overflow in remaining + threads - 1.
          const int num_blocks = remaining / threads + (remaining % threads != 0);
          // The last round (a single block) writes the total straight to d_out.
          int * d_dst = (num_blocks == 1) ? d_out : d_buffers[(round - 1) % 2];

          printf("Round:%.d\n", round);
          printf("NumBlocks:%.d\n", num_blocks);
          printf("NumThreads:%.d\n", threads);
          printf("size of array:%.d\n", remaining);

          reduce_kernel << <num_blocks, threads >> > (d_dst, d_src, remaining);
          reduce_check(cudaGetLastError(), "reduce_kernel launch");

          if (num_blocks == 1)
          {
              break;
          }
          // This round's partial sums are the next round's input.
          d_src = d_dst;
          remaining = num_blocks;
      }

      // cudaFree(NULL) is a no-op, so the single-round case needs no special handling.
      reduce_check(cudaFree(d_scratch), "cudaFree");
  }
  89.  
  90. /* -------- MAIN -------- */
  /** Prints a diagnostic and terminates the process if a CUDA call made by main() failed. */
  static void main_check(cudaError_t status, const char * what)
  {
      if (status != cudaSuccess)
      {
          fprintf(stderr, "main: %s failed: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
      }
  }

  /**
   * Fills an array of 2^26 ones, sums it on the GPU with reduce(), and prints
   * the elapsed time and the resulting sum.
   *
   * @return 0 when the GPU sum equals the element count, EXIT_FAILURE otherwise
   */
  int main(int argc, char **argv)
  {
      (void)argc;
      (void)argv;

      printf("@@STARTING@@ \n");

      // Setting num_threads
      const int num_threads = 512;
      // Making non-bogus data: every element is 1, so the expected sum is size.
      const int size = 1 << 26;
      const int size_out = 1;
      const size_t in_bytes = sizeof(int) * (size_t)size;

      cudaEvent_t start, stop;
      main_check(cudaEventCreate(&start), "cudaEventCreate");
      main_check(cudaEventCreate(&stop), "cudaEventCreate");

      int * h_in = (int *)malloc(in_bytes);
      if (h_in == NULL)
      {
          fprintf(stderr, "main: could not allocate %zu bytes of host memory\n", in_bytes);
          return EXIT_FAILURE;
      }
      for (int i = 0; i < size; i++) h_in[i] = 1;

      // Setting the data on the GPU
      int * d_in = NULL;
      int * d_out = NULL;
      main_check(cudaMalloc(&d_in, in_bytes), "cudaMalloc");
      main_check(cudaMalloc(&d_out, sizeof(int) * size_out), "cudaMalloc");
      main_check(cudaMemcpy(d_in, h_in, in_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");

      // Running kernel wrapper, bracketed by events for timing
      main_check(cudaEventRecord(start), "cudaEventRecord");
      reduce(d_out, d_in, size, num_threads);
      main_check(cudaEventRecord(stop), "cudaEventRecord");

      main_check(cudaEventSynchronize(stop), "cudaEventSynchronize");
      float milliseconds = 0;
      main_check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
      printf("Elapsed time was: %f milliseconds\n", milliseconds);

      int result = 0;
      main_check(cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy");
      printf("FINAL SUM IS: %d\n", result);

      // Release everything acquired above.
      main_check(cudaFree(d_in), "cudaFree");
      main_check(cudaFree(d_out), "cudaFree");
      main_check(cudaEventDestroy(start), "cudaEventDestroy");
      main_check(cudaEventDestroy(stop), "cudaEventDestroy");
      free(h_in);

      if (result != size)
      {
          fprintf(stderr, "main: wrong sum, expected %d\n", size);
          return EXIT_FAILURE;
      }
      return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement