#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

// Problem size and launch configuration, kept as global variables.
const int arraySizeX = 12;
const int arraySizeY = 12;
const int numThreadsPerBlock = 5;
// Divide the elements into the correct number of blocks, rounding up so a
// partially filled final block covers any remainder.
int numBlocks = arraySizeX * arraySizeY / numThreadsPerBlock + ((arraySizeX * arraySizeY) % numThreadsPerBlock == 0 ? 0 : 1);
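// With the sizes above, 12 * 12 = 144 elements split into blocks of 5 threads
// gives 28 full blocks plus one partial block, so numBlocks == 29 (145 threads
// in total); the kernel bounds-checks its index so the one spare thread in the
// last block does no work.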

cudaError_t addWithCuda(int **c, int **a, int **b, size_t sizeX, size_t sizeY);

// Element-wise addition kernel. The device buffers are flat 1D arrays, one
// thread per element.
__global__ void addKernel(int *c, const int *a, const int *b, int numElements)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x; // offset the thread index by its block's offset
    if (idx < numElements)                           // guard the spare threads in the final, partial block
        c[idx] = a[idx] + b[idx];
}
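// The device buffers use a flat, row-major layout: host element a[i][j] lands
// at device index i * arraySizeY + j. For example, (i, j) = (2, 3) maps to
// flat index 2 * 12 + 3 = 27.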

int main()
{
    int **a;
    int **b;
    int **c;
    int i;

    // Allocate the host arrays as one row-pointer array per matrix,
    // with each row malloc'd separately.
    a = (int**)malloc(arraySizeX * sizeof(int*));
    for (i = 0; i < arraySizeX; i++)
        a[i] = (int*)malloc(arraySizeY * sizeof(int));

    b = (int**)malloc(arraySizeX * sizeof(int*));
    for (i = 0; i < arraySizeX; i++)
        b[i] = (int*)malloc(arraySizeY * sizeof(int));

    c = (int**)malloc(arraySizeX * sizeof(int*));
    for (i = 0; i < arraySizeX; i++)
        c[i] = (int*)malloc(arraySizeY * sizeof(int));

    // Initialize the arrays: a counts 1, 2, 3, ... and b holds ten times that.
    int j;
    int iterator = 1;
    for (i = 0; i < arraySizeX; i++)
        for (j = 0; j < arraySizeY; j++)
        {
            a[i][j] = iterator;
            b[i][j] = iterator * 10;
            c[i][j] = 0;
            iterator++;
        }
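    // With this initialization the expected sum is simply eleven times the
    // counter: c[0][0] = 1 + 10 = 11, c[0][1] = 2 + 20 = 22, and the last
    // element c[11][11] = 144 + 1440 = 1584.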

    // Add the arrays in parallel on the GPU.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySizeX, arraySizeY);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // Print the result matrix.
    for (i = 0; i < arraySizeX; i++)
    {
        for (j = 0; j < arraySizeY; j++)
        {
            printf("%d ", c[i][j]);
        }
        printf("\n");
    }

    // Free the host arrays.
    for (i = 0; i < arraySizeX; i++)
    {
        free(a[i]);
        free(b[i]);
        free(c[i]);
    }
    free(a);
    free(b);
    free(c);

    system("pause");

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Parallel Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        system("pause");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add the two input arrays in parallel.
// The host arrays are arrays of row pointers; on the device they are staged
// as flat, contiguous buffers of sizeX * sizeY ints.
cudaError_t addWithCuda(int **c, int **a, int **b, size_t sizeX, size_t sizeY)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    size_t numBytes = sizeX * sizeY * sizeof(int);
    cudaError_t cudaStatus;
    size_t i;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        system("pause");
        goto Error;
    }

    // Allocate GPU buffers for the three arrays (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_a, numBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        system("pause");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, numBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        system("pause");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_c, numBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        system("pause");
        goto Error;
    }

    // Copy the input arrays from host memory to the GPU buffers.
    // Each host row was malloc'd separately, so the rows are copied one at a
    // time into the right slice of the flat device buffer.
    for (i = 0; i < sizeX; i++) {
        cudaStatus = cudaMemcpy(dev_a + i * sizeY, a[i], sizeY * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            system("pause");
            goto Error;
        }

        cudaStatus = cudaMemcpy(dev_b + i * sizeY, b[i], sizeY * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            system("pause");
            goto Error;
        }
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<numBlocks, numThreadsPerBlock>>>(dev_c, dev_a, dev_b, (int)(sizeX * sizeY));

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        system("pause");
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        system("pause");
        goto Error;
    }

    // Copy the output array from the GPU buffer back to host memory, row by row.
    for (i = 0; i < sizeX; i++) {
        cudaStatus = cudaMemcpy(c[i], dev_c + i * sizeY, sizeY * sizeof(int), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            system("pause");
            goto Error;
        }
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
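
// Build sketch (assuming the file is saved as kernel.cu; the output name is arbitrary):
//   nvcc -o addgrid kernel.cu
//   ./addgrid
// On Windows a Visual Studio CUDA project builds the .cu file directly; the
// system("pause") calls assume a Windows console session.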