Guest User

Untitled

a guest
Jan 16th, 2018
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.50 KB | None | 0 0
  1. #include <cuda_runtime.h>
  2. #include <iostream>
  3. #include <device_launch_parameters.h>
  4. #include "cuda_error_detector.h"
  5. #include <vector>
  6.  
  7.  
  // Kernel launch-configuration helpers.
  //
  // When the translation unit is compiled by nvcc (__CUDACC__ is defined) each
  // macro expands to the corresponding <<< ... >>> execution configuration.
  // Otherwise every macro expands to nothing, so `kernel KERNEL_ARGS2(g, b)(args)`
  // is seen as the ordinary call `kernel (args)`.
  //
  //   KERNEL_ARGS2(grid, block)
  //   KERNEL_ARGS3(grid, block, shared_bytes)
  //   KERNEL_ARGS4(grid, block, shared_bytes, stream_id)
  #if defined(__CUDACC__)
  #define KERNEL_ARGS2(grid, block) <<< grid, block >>>
  #define KERNEL_ARGS3(grid, block, shared_bytes) <<< grid, block, shared_bytes >>>
  #define KERNEL_ARGS4(grid, block, shared_bytes, stream_id) <<< grid, block, shared_bytes, stream_id >>>
  #else
  #define KERNEL_ARGS2(grid, block)
  #define KERNEL_ARGS3(grid, block, shared_bytes)
  #define KERNEL_ARGS4(grid, block, shared_bytes, stream_id)
  #endif
  17.  
  // Element-wise addition of two integer arrays on the GPU, where the number of
  // elements is larger than the number of launched threads, so every thread
  // processes several elements in a loop.
  namespace add_loop_blocks
  {
      // Launch configuration used by main_add_loop_blocks().
      const int number_of_blocks = 100;
      const int threads_per_block = 2;
      // Total number of threads in the default launch (blocks * threads per block).
      const int number_of_threads = number_of_blocks * threads_per_block;
      // Number of elements in each of the arrays a, b and c.
      const int number_of_data = 654321;

      // Returns the flat, launch-wide index of the calling thread.
      //
      // The block index is linearised over the (x, y, z) grid dimensions, then
      // the thread index inside the block is linearised over the (x, y, z) block
      // dimensions and added on top. The unsigned result is converted to int.
      __device__ int get_id()
      {
          int block_id = blockIdx.z * (gridDim.x * gridDim.y)
                       + blockIdx.y * (gridDim.x)
                       + blockIdx.x;
          int threadId = block_id * (blockDim.x * blockDim.y * blockDim.z)
                       + (threadIdx.z * (blockDim.x * blockDim.y))
                       + (threadIdx.y * blockDim.x)
                       + threadIdx.x;
          return threadId;
      }


      // Kernel: c[i] = a[i] + b[i] for every i in [0, number_of_data).
      //
      // a, b : input arrays, each readable by the device for number_of_data ints.
      // c    : output array, writable by the device for number_of_data ints.
      //
      // Each thread starts at its flat index (see get_id) and advances by the
      // total number of threads in the launch. The step is taken from gridDim and
      // blockDim rather than from the number_of_threads constant, so the kernel
      // covers every element exactly once for any launch configuration, not only
      // for number_of_blocks x threads_per_block. No shared memory is used.
      __global__ void add(const int* a, const int* b, int *c)
      {
          const int total_threads = (gridDim.x * gridDim.y * gridDim.z)
                                  * (blockDim.x * blockDim.y * blockDim.z);

          for (int id = get_id(); id < number_of_data; id += total_threads)
          {
              c[id] = a[id] + b[id];
          }
      }

      // Allocates three managed arrays, fills a and b on the host, adds them on
      // the device, prints the last element of a, b and c, and frees the arrays.
      //
      // Every CUDA runtime call is passed through CudaSafeCall (declared in
      // cuda_error_detector.h). Returns 0.
      int main_add_loop_blocks(void)
      {
          const auto sizes = number_of_data * sizeof(int);
          int* a = nullptr;
          int* b = nullptr;
          int* c = nullptr;

          // Allocate managed memory, which is used by both the CPU and the GPU.
          {
              CudaSafeCall(cudaMallocManaged(&a, sizes));
              CudaSafeCall(cudaMallocManaged(&b, sizes));
              CudaSafeCall(cudaMallocManaged(&c, sizes));
          }

          // Assign initial values. The CPU writes directly into managed memory.
          for (int i = 0; i < number_of_data; ++i)
          {
              a[i] = i;
              b[i] = i + 1;
          }

          // Add. The GPU reads and writes the same managed memory.
          add_loop_blocks::add KERNEL_ARGS2(number_of_blocks, threads_per_block)(a, b, c);

          // A kernel launch returns no status itself; launch errors (for example
          // an invalid configuration) must be fetched with cudaGetLastError().
          CudaSafeCall(cudaGetLastError());

          // Wait for the kernel to finish before the host reads the results.
          CudaSafeCall(cudaDeviceSynchronize());

          // To show every element, uncomment the loop below.
          // for (int i = 0; i < number_of_data; ++i)
          // {
          //     std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";
          // }

          // Show only the last item.
          const auto i = number_of_data - 1;
          std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";

          // Free the managed allocations.
          {
              CudaSafeCall(cudaFree(a));
              CudaSafeCall(cudaFree(b));
              CudaSafeCall(cudaFree(c));
          }

          return 0;
      }
  }
  95.  
  96.  
  // Program entry point: runs the add_loop_blocks sample and uses the status it
  // returns as the process exit code instead of discarding it.
  int main()
  {
      return add_loop_blocks::main_add_loop_blocks();
  }
Add Comment
Please, Sign In to add comment