#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <cstdlib>   // rand, srand, system
#include <ctime>     // time
#include <math.h>
cudaError_t addWithCuda(int *c, int *a, int *b, size_t size);

// Each thread adds one element of two size x size matrices stored in row-major order.
__global__ void addKernel(int *c, int *a, int *b, size_t size)
{
    // 2D thread coordinates: i is the column index, j is the row index.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against threads that fall outside the matrix when the grid is rounded up.
    if (i < size && j < size)
    {
        // Row-major linear index into the flattened size x size matrix.
        int index = i + j * size;
        c[index] = a[index] + b[index];
    }
}

int main()
{
    srand((unsigned int) time(NULL));

    const int N = 2;
    int a[N][N];
    int b[N][N];
    int c[N][N] = { 0 };

    // Fill the input matrices with random values in [0, 99].
    int i = 0;
    int j = 0;
    for (i = 0; i < N; ++i)
    {
        for (j = 0; j < N; ++j)
        {
            a[i][j] = rand() % 100;
            b[i][j] = rand() % 100;
            c[i][j] = 0;
        }
    }

    // Add the matrices in parallel on the GPU.
    cudaError_t cudaStatus = addWithCuda(c[0], a[0], b[0], N);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // Print every element-wise sum.
    for (i = 0; i < N; ++i)
    {
        for (j = 0; j < N; ++j)
        {
            printf("%d + %d = %d\n", a[i][j], b[i][j], c[i][j]);
        }
    }

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and the Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    system("pause"); // Windows-only; keeps the console window open.
    return 0;
}

// Helper function that uses CUDA to add two sizeSingle x sizeSingle matrices.
cudaError_t addWithCuda(int *c, int *a, int *b, size_t sizeSingle)
{
    size_t size = sizeSingle * sizeSingle; // total number of elements
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Launch configuration, declared before the first goto so that no jump skips
    // their initialization. The grid is rounded up so that every element of the
    // sizeSingle x sizeSingle matrix is covered by exactly one thread.
    dim3 threadsPerBlock(2, 2);
    dim3 numberOfBlocks((sizeSingle + threadsPerBlock.x - 1) / threadsPerBlock.x,
                        (sizeSingle + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<numberOfBlocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, sizeSingle);

    // Check for any error reported by the kernel launch itself.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
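
// Minimal build-and-run sketch, assuming the listing is saved as matrix_add.cu
// (the file and output names are assumptions) and the CUDA toolkit's nvcc
// compiler is available:
//   nvcc matrix_add.cu -o matrix_add
//   ./matrix_add        (matrix_add.exe on Windows, where system("pause") keeps the console open)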