Advertisement
Guest User

kuda

a guest
May 27th, 2019
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 5.25 KB | None | 0 0
  1.  
  2. #include "cuda_runtime.h"
  3. #include "device_launch_parameters.h"
  4. #include <stdio.h>
  5.  
  // Matrix extents: arraySizeX elements per row, arraySizeY rows.
  const int arraySizeX = 5;
  const int arraySizeY = 6;

  // Launch configuration: block_size threads per block, and enough blocks
  // (rounded up) to give every matrix element its own thread.
  int block_size = arraySizeX;
  int n_blocks = (arraySizeX*arraySizeY + block_size - 1) / block_size;
  11.  
  // Forward declaration of the host-side helper that main() calls; the
  // definition follows main().
  cudaError_t addWithCuda(
      int c[arraySizeY][arraySizeX],
      const int a[arraySizeY][arraySizeX],
      const int b[arraySizeY][arraySizeX],
      size_t sizeX);
  13.  
  /**
   * Element-wise addition over a row-major matrix: c[i] = a[i] + b[i].
   *
   * Each thread handles at most one element. The flat element index is built
   * from a (column, row) pair taken from the x and y launch dimensions. Under
   * a 1D launch the y terms are all zero, so row is 0 and the x dimension
   * alone walks the flattened matrix.
   *
   * @param c           output buffer, dereferenced on the device
   * @param a           first input buffer, dereferenced on the device
   * @param b           second input buffer, dereferenced on the device
   * @param arraySizeX  row width in elements; inside the kernel this parameter
   *                    shadows the file-scope constant of the same name
   *
   * The row count is read from the file-scope constant arraySizeY.
   */
  __global__ void addKernel(int *c, const int *a, const int *b, int arraySizeX)
  {
      // threadIdx.x - index of the thread within its block
      // blockIdx.x  - index of the block within the grid
      // blockDim.x  - number of threads per block
      int column = threadIdx.x + blockIdx.x*blockDim.x;
      int row = threadIdx.y + blockIdx.y*blockDim.y;
      // Row-major layout: consecutive rows are arraySizeX elements apart
      // (the row width), not arraySizeY (the row count).
      int i = row*arraySizeX+column;
      // Surplus threads past the last element must not touch memory.
      if(i < arraySizeX*arraySizeY)
          c[i] = a[i] + b[i];
  }
  25.  
  26.  
  /**
   * Demo driver: adds two fixed arraySizeY x arraySizeX integer matrices with
   * addWithCuda() and prints each row of the result next to its operands.
   *
   * @return 0 on success, 1 if the addition or the final device reset fails.
   */
  int main()
  {

      const int a[arraySizeY][arraySizeX] = {{ 1, 2, 3, 4, 5 },
                                          { 21, 22, 23, 24, 25 },
                                          { 31, 32, 33, 34, 35 },
                                          { 41, 42, 43, 44, 45 },
                                          { 51, 52, 53, 54, 55 },
                                          { 61, 62, 63, 64, 65 }};
      const int b[arraySizeY][arraySizeX] = {{ 10, 20, 30, 40, 50 },
                                          { 210, 220, 230, 240, 250 },
                                          { 310, 320, 330, 340, 350 },
                                          { 410, 420, 430, 440, 450 },
                                          { 510, 520, 530, 540, 550 },
                                          { 610, 620, 630, 640, 650 }};
      int c[arraySizeY][arraySizeX] = { 0 };

      // Add the matrices in parallel on the GPU.
      cudaError_t cudaStatus = addWithCuda(c, a, b, arraySizeX);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "addWithCuda failed!");
          return 1;
      }

      // One output line per matrix row; the operand text in each format string
      // mirrors the literals in a and b above.
      printf("{ 1, 2, 3, 4, 5} + { 10, 20, 30, 40, 50} = { %d, %d, %d, %d, %d}\n",
          c[0][0], c[0][1], c[0][2], c[0][3], c[0][4]);
      printf("{21,22,23,24,25} + {210,220,230,240,250} = {%d,%d,%d,%d,%d}\n",
          c[1][0], c[1][1], c[1][2], c[1][3], c[1][4]);
      printf("{31,32,33,34,35} + {310,320,330,340,350} = {%d,%d,%d,%d,%d}\n",
          c[2][0], c[2][1], c[2][2], c[2][3], c[2][4]);
      printf("{41,42,43,44,45} + {410,420,430,440,450} = {%d,%d,%d,%d,%d}\n",
          c[3][0], c[3][1], c[3][2], c[3][3], c[3][4]);
      printf("{51,52,53,54,55} + {510,520,530,540,550} = {%d,%d,%d,%d,%d}\n",
          c[4][0], c[4][1], c[4][2], c[4][3], c[4][4]);
      printf("{61,62,63,64,65} + {610,620,630,640,650} = {%d,%d,%d,%d,%d}\n",
          c[5][0], c[5][1], c[5][2], c[5][3], c[5][4]);

      // cudaDeviceReset must be called before exiting in order for profiling and
      // tracing tools such as Nsight and Visual Profiler to show complete traces.
      // It replaces the deprecated cudaThreadExit.
      cudaStatus = cudaDeviceReset();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaDeviceReset failed!");
          return 1;
      }

      return 0;
  }
  74.  
  /**
   * Host helper that computes c = a + b element-wise on the GPU.
   *
   * Selects device 0, allocates three device buffers, uploads a and b,
   * launches addKernel with the file-scope launch configuration
   * (n_blocks, block_size), waits for completion and downloads the result
   * into c. The device buffers are released on every path, success or failure.
   *
   * @param c      host output matrix, overwritten with the sums on success
   * @param a      first host input matrix
   * @param b      second host input matrix
   * @param sizeX  row width in elements; the buffers hold sizeX * arraySizeY ints
   * @return cudaSuccess, or the status of the first CUDA call that failed
   *         (a diagnostic is also written to stderr).
   */
  cudaError_t addWithCuda(int c[arraySizeY][arraySizeX], const int a[arraySizeY][arraySizeX], const int b[arraySizeY][arraySizeX], size_t sizeX)
  {
      int *dev_a = 0;
      int *dev_b = 0;
      int *dev_c = 0;
      cudaError_t cudaStatus;
      // Size of one matrix in bytes. Declared before the first goto because a
      // jump in C++ may not skip over an initialized declaration.
      const size_t byteCount = sizeX * arraySizeY * sizeof(int);

      // Choose which GPU to run on, change this on a multi-GPU system.
      cudaStatus = cudaSetDevice(0);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
          goto Error;
      }

      // Allocate GPU buffers for the three matrices (two input, one output).
      cudaStatus = cudaMalloc((void**)&dev_c, byteCount);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }

      cudaStatus = cudaMalloc((void**)&dev_a, byteCount);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }

      cudaStatus = cudaMalloc((void**)&dev_b, byteCount);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }

      // Copy the input matrices from host memory to the GPU buffers.
      cudaStatus = cudaMemcpy(dev_a, a, byteCount, cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }

      cudaStatus = cudaMemcpy(dev_b, b, byteCount, cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }

      // Launch the kernel with one thread per element; the number of blocks
      // depends on the size of the vector/matrix.
      addKernel<<<n_blocks, block_size>>>(dev_c, dev_a, dev_b, static_cast<int>(sizeX));

      // A kernel launch returns no status itself; launch errors (for example
      // an invalid configuration) must be fetched explicitly.
      cudaStatus = cudaGetLastError();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
          goto Error;
      }

      // cudaDeviceSynchronize waits for the kernel to finish, and returns
      // any errors encountered while it ran. It replaces the deprecated
      // cudaThreadSynchronize.
      cudaStatus = cudaDeviceSynchronize();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
          goto Error;
      }

      // Copy the output matrix from the GPU buffer to host memory.
      cudaStatus = cudaMemcpy(c, dev_c, byteCount, cudaMemcpyDeviceToHost);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }

  Error:
      // Reached on success as well as failure; the pointers start as null, so
      // freeing buffers that were never allocated is harmless.
      cudaFree(dev_c);
      cudaFree(dev_a);
      cudaFree(dev_b);

      return cudaStatus;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement