Advertisement
Guest User

Untitled

a guest
Nov 15th, 2019
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.06 KB | None | 0 0
  1.  
  2. #include "cuda_runtime.h"
  3. #include "device_launch_parameters.h"
  4.  
  5. #include <stdio.h>
  6.  
  7. cudaError_t addWithCuda(int *c, int *a, int *b, size_t size);
  8.  
  // Element-wise addition of two size x size integer matrices held in flat
  // buffers: c[k] = a[k] + b[k], where k = x + y * size.
  //
  // Launch layout: a 2D grid of 2D blocks. The x dimension of the launch maps
  // to the fastest-varying index and the y dimension to the slowest one. The
  // grid may over-cover the matrix; threads whose (x, y) falls outside
  // [0, size) x [0, size) do nothing. No shared memory is used.
  //
  // c, a and b must be device pointers to at least size * size ints.
  __global__ void addKernel(int *c, int *a, int *b, size_t size)
  {
      // Widen to size_t before multiplying so neither the global thread
      // coordinate nor the flat index can be truncated to a 32-bit int.
      const size_t x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
      const size_t y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;

      // Guard the grid tail: the grid rarely divides the matrix evenly.
      if (x < size && y < size)
      {
          const size_t index = x + y * size;
          c[index] = a[index] + b[index];
      }
  }
  21.  
  // Demo driver: fills two 5x5 matrices, adds them on the GPU through
  // addWithCuda, and prints every "a + b = c" triple.
  //
  // Returns 0 on success, 1 if the GPU addition or the device reset fails.
  int main()
  {
      const int arraySize = 5;
      int a[arraySize][arraySize];
      int b[arraySize][arraySize];
      int c[arraySize][arraySize] = { 0 };  // zero-initialised; filled by the GPU

      for (int i = 0; i < arraySize; ++i)
      {
          for (int j = 0; j < arraySize; ++j)
          {
              a[i][j] = 10 * i + j;
              b[i][j] = 10 * (10 * i + j);
          }
      }

      // Add matrices in parallel. The 2D arrays are contiguous, so a pointer to
      // the first row addresses all arraySize * arraySize elements.
      cudaError_t cudaStatus = addWithCuda(c[0], a[0], b[0], arraySize);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "addWithCuda failed!");
          return 1;
      }

      for (int i = 0; i < arraySize; ++i)
      {
          for (int j = 0; j < arraySize; ++j)
          {
              printf("%d + %d = %d\n", a[i][j], b[i][j], c[i][j]);
          }
      }

      // cudaDeviceReset must be called before exiting in order for profiling and
      // tracing tools such as Nsight and Visual Profiler to show complete traces.
      // It replaces the deprecated cudaThreadExit.
      cudaStatus = cudaDeviceReset();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaDeviceReset failed!");
          return 1;
      }

      return 0;
  }
  66.  
  // Helper function for using CUDA to add two square matrices in parallel.
  //
  // c, a, b     Host pointers to contiguous buffers of sizeSingle * sizeSingle
  //             ints. a and b are the inputs; c receives a + b element-wise.
  // sizeSingle  Edge length of the square matrices (elements per row/column).
  //
  // Returns cudaSuccess on success, otherwise the first CUDA error encountered.
  // All device buffers are released before returning, on every path.
  // A sizeSingle of 0 is treated as "nothing to do" and returns cudaSuccess
  // once the device has been selected.
  cudaError_t addWithCuda(int *c, int *a, int *b, size_t sizeSingle)
  {
      // Every local with an initialiser is declared before the first goto, so
      // that no jump to Error bypasses an initialisation (ill-formed in C++).
      const unsigned int blockEdge = 16;  // 16 x 16 = 256 threads per block
      const size_t size = sizeSingle * sizeSingle;
      const size_t bytes = size * sizeof(int);
      // Ceil-division on the edge length: enough blocks to cover each dimension.
      const size_t blocksPerEdge = (sizeSingle + blockEdge - 1) / blockEdge;
      const dim3 threadsPerBlock(blockEdge, blockEdge);
      const dim3 numBlocks((unsigned int)blocksPerEdge, (unsigned int)blocksPerEdge);
      int *dev_a = 0;
      int *dev_b = 0;
      int *dev_c = 0;
      cudaError_t cudaStatus;

      // Choose which GPU to run on, change this on a multi-GPU system.
      cudaStatus = cudaSetDevice(0);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
          goto Error;
      }

      // Nothing to add; a grid with a zero dimension would be an invalid launch.
      if (size == 0) {
          goto Error;
      }

      // Allocate GPU buffers for three matrices (two input, one output).
      cudaStatus = cudaMalloc((void**)&dev_c, bytes);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }

      cudaStatus = cudaMalloc((void**)&dev_a, bytes);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }

      cudaStatus = cudaMalloc((void**)&dev_b, bytes);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }

      // Copy input matrices from host memory to GPU buffers.
      cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }

      cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }

      // Launch a kernel on the GPU with (at least) one thread for each element.
      addKernel<<<numBlocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, sizeSingle);

      // A kernel launch returns nothing; configuration errors (for example a
      // grid that is too large) are only visible through cudaGetLastError.
      cudaStatus = cudaGetLastError();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
          goto Error;
      }

      // cudaDeviceSynchronize waits for the kernel to finish, and returns
      // any errors encountered during its execution.
      cudaStatus = cudaDeviceSynchronize();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
          goto Error;
      }

      // Copy output matrix from GPU buffer to host memory.
      cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }

  Error:
      // cudaFree accepts null pointers, so this is safe on every path.
      cudaFree(dev_c);
      cudaFree(dev_a);
      cudaFree(dev_b);

      return cudaStatus;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement