Advertisement
Guest User

gpuMatrixSum

a guest
Nov 27th, 2014
135
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.13 KB | None | 0 0
  #include "cuda_runtime.h"
  #include "device_launch_parameters.h"

  #include <cstdlib>
  #include <iomanip>
  #include <iostream>
  6.  
  7. // Utility Functions
  8. int** matrixAlloc(int rows, int cols);
  9. void matrixFill(int** ptr, int rows, int cols);
  10. void matrixPrint(int** ptr, int rows, int cols);
  11.  
  12. // cuda kernel
  13. __global__ void gpuMatrixAdd(int** matrix_a, int** matrix_b, int** matrix_c, int rows, int cols);
  14.  
  // Prints "<what> failed: <CUDA error string>" when err is not cudaSuccess.
  // Returns true when the call succeeded.
  static bool cudaOk(cudaError_t err, const char* what){
      if (err == cudaSuccess)
          return true;
      std::cout << what << " failed: " << cudaGetErrorString(err) << "\n";
      return false;
  }

  // True when the row-pointer table and each of its `rows` row pointers are non-null.
  static bool hostMatrixUsable(int** m, int rows){
      if (m == nullptr)
          return false;
      for (int i = 0; i < rows; i++)
          if (m[i] == nullptr)
              return false;
      return true;
  }

  // Frees a host matrix laid out as a malloc'ed table of malloc'ed rows.
  // A nullptr table is accepted and ignored.
  static void hostMatrixFree(int** m, int rows){
      if (m == nullptr)
          return;
      for (int i = 0; i < rows; i++)
          free(m[i]);
      free(m);
  }

  // Creates the device-side counterpart of a rows x cols host matrix:
  //   *dev_data : one contiguous device buffer of rows * cols ints
  //   *dev_rows : a device table of `rows` pointers, entry i addressing row i
  //               inside *dev_data (this is what the kernel indexes as m[i][j])
  // When host is not nullptr its rows are copied into *dev_data; otherwise the
  // device storage is left uninitialized (used for the output matrix).
  // On failure the outputs may be partially allocated; the caller releases
  // them with cudaFree.
  static cudaError_t deviceMatrixCreate(int** host, int rows, int cols,
                                        int*** dev_rows, int** dev_data){
      *dev_rows = nullptr;
      *dev_data = nullptr;

      const size_t rowBytes = sizeof(int) * cols;
      const size_t tableBytes = sizeof(int*) * rows;

      cudaError_t err = cudaMalloc((void**)dev_data, rowBytes * rows);
      if (err != cudaSuccess)
          return err;
      err = cudaMalloc((void**)dev_rows, tableBytes);
      if (err != cudaSuccess)
          return err;

      // Stage the device row addresses on the host. Only pointer arithmetic is
      // done here; the device addresses are never dereferenced on the host.
      int** staging = (int**)malloc(tableBytes);
      if (staging == nullptr)
          return cudaErrorMemoryAllocation;
      for (int i = 0; i < rows; i++)
          staging[i] = *dev_data + (size_t)i * cols;
      err = cudaMemcpy(*dev_rows, staging, tableBytes, cudaMemcpyHostToDevice);
      free(staging);

      // Host rows are separate allocations, so they are uploaded one at a time.
      for (int i = 0; err == cudaSuccess && host != nullptr && i < rows; i++)
          err = cudaMemcpy(*dev_data + (size_t)i * cols, host[i], rowBytes,
                           cudaMemcpyHostToDevice);
      return err;
  }

  // Copies a rows x cols matrix from contiguous device storage back into the
  // separately allocated rows of a host matrix.
  static cudaError_t deviceMatrixFetch(int** host, const int* dev_data, int rows, int cols){
      cudaError_t err = cudaSuccess;
      for (int i = 0; err == cudaSuccess && i < rows; i++)
          err = cudaMemcpy(host[i], dev_data + (size_t)i * cols, sizeof(int) * cols,
                           cudaMemcpyDeviceToHost);
      return err;
  }

  // Adds two N x N integer matrices on the GPU and prints the result.
  // Returns 0 on success and -1 when a host allocation or any CUDA call fails.
  int main(){

      int** host_a = nullptr;         // Host matrices (tables of row pointers)
      int** host_b = nullptr;
      int** host_c = nullptr;

      int** dev_a = nullptr;          // Device row-pointer tables handed to the kernel
      int** dev_b = nullptr;
      int** dev_c = nullptr;

      int* dev_a_data = nullptr;      // Contiguous device storage behind each table
      int* dev_b_data = nullptr;
      int* dev_c_data = nullptr;

      const int N = 8;                // matrices width and height

      // One block of N x N threads: one thread per matrix element.
      const int numBlocks = 1;
      const dim3 threadsPerBlock(N, N);

      host_a = matrixAlloc(N, N);     // allocating matrices on host
      host_b = matrixAlloc(N, N);
      host_c = matrixAlloc(N, N);

      bool ok = hostMatrixUsable(host_a, N) &&
                hostMatrixUsable(host_b, N) &&
                hostMatrixUsable(host_c, N);
      if (!ok)
          std::cout << "host matrix allocation failed\n";

      if (ok){
          matrixFill(host_a, N, N);   // filling matrices on host
          matrixFill(host_b, N, N);
      }

      // selecting first CUDA device (optional)
      ok = ok && cudaOk(cudaSetDevice(0), "cudaSetDevice");

      // allocating matrices on device and uploading the two inputs
      ok = ok && cudaOk(deviceMatrixCreate(host_a, N, N, &dev_a, &dev_a_data),
                        "uploading matrix a");
      ok = ok && cudaOk(deviceMatrixCreate(host_b, N, N, &dev_b, &dev_b_data),
                        "uploading matrix b");
      ok = ok && cudaOk(deviceMatrixCreate(nullptr, N, N, &dev_c, &dev_c_data),
                        "allocating matrix c");

      // running kernel
      if (ok){
          gpuMatrixAdd<<<numBlocks, threadsPerBlock>>>(dev_a, dev_b, dev_c, N, N);
          // A launch returns no status; a bad configuration shows up here.
          ok = cudaOk(cudaGetLastError(), "gpuMatrixAdd launch");
      }
      // Faults raised while the kernel runs are reported by the next synchronizing call.
      ok = ok && cudaOk(cudaDeviceSynchronize(), "gpuMatrixAdd execution");

      // getting matrix c back to host
      ok = ok && cudaOk(deviceMatrixFetch(host_c, dev_c_data, N, N),
                        "downloading matrix c");

      if (ok){
          std::cout << "cudaSuccess\n";

          // printing result
          std::cout << "host_c = \n";
          matrixPrint(host_c, N, N);
      }

      // Release everything on every path; cudaFree and free both accept null pointers.
      cudaFree(dev_a);
      cudaFree(dev_b);
      cudaFree(dev_c);
      cudaFree(dev_a_data);
      cudaFree(dev_b_data);
      cudaFree(dev_c_data);
      hostMatrixFree(host_a, N);
      hostMatrixFree(host_b, N);
      hostMatrixFree(host_c, N);

      if (!ok)
          return -1;

      std::cin.get();                 // keep the console open until a key is pressed
      return 0;
  }
  82.  
  83.  
  84. //
  85. // UTILITY FUNCTIONS
  86. //
  87.  
  // Allocates a rows x cols int matrix on the host as a malloc'ed table of
  // `rows` pointers, each addressing its own malloc'ed array of `cols` ints.
  // Element values are left uninitialized.
  //
  // Returns the table, or nullptr when rows or cols is not positive or when
  // any allocation fails (rows allocated so far are released first).
  // To release the matrix, free() every row and then the table itself.
  int** matrixAlloc(int rows, int cols){
      if (rows <= 0 || cols <= 0)
          return nullptr;

      int** ptr = (int**)malloc(sizeof(int*) * rows);
      if (ptr == nullptr)
          return nullptr;

      for (int i = 0; i < rows; i++){
          ptr[i] = (int*)malloc(sizeof(int) * cols);
          if (ptr[i] == nullptr){
              // Undo the rows allocated before the failure so nothing leaks.
              while (i-- > 0)
                  free(ptr[i]);
              free(ptr);
              return nullptr;
          }
      }

      return ptr;
  }
  96.  
  // Writes row index + column index into every element of a rows x cols
  // matrix stored as a table of row pointers.
  // A nullptr table is reported on std::cout and nothing is written.
  void matrixFill(int** ptr, int rows, int cols){
      if (ptr == nullptr){
          std::cout << "matrixFill called with empty pointer.\n";
          return;
      }

      for (int r = 0; r < rows; ++r){
          for (int c = 0; c < cols; ++c){
              ptr[r][c] = r + c;
          }
      }
  }
  105.  
  // Prints a rows x cols matrix (table of row pointers) to std::cout, one
  // matrix row per output line, each value right-aligned in a 4-character
  // field. Prints nothing when the table pointer is nullptr.
  void matrixPrint(int** ptr, int rows, int cols){
      if (ptr == nullptr)
          return;

      for (int r = 0; r < rows; ++r){
          for (int c = 0; c < cols; ++c){
              std::cout << std::setw(4) << ptr[r][c];
          }
          std::cout << std::endl;
      }
  }
  114.  
  115. //
  116. // CUDA Kernel
  117. //
  // Element-wise sum: matrix_c[i][j] = matrix_a[i][j] + matrix_b[i][j] for
  // every 0 <= i < rows, 0 <= j < cols.
  //
  // Each matrix argument is a table of `rows` row pointers, each addressing
  // `cols` ints; the tables and the rows must all be readable from device code.
  //
  // Launch layout: the x dimension walks rows and the y dimension walks
  // columns. Both dimensions use a grid-stride loop, so any grid/block shape
  // covers the whole matrix; a single block of (rows, cols) threads gives one
  // thread per element. No shared memory is used.
  __global__ void gpuMatrixAdd(int** matrix_a, int** matrix_b, int** matrix_c, int rows, int cols){
      const int rowStride = gridDim.x * blockDim.x;
      const int colStride = gridDim.y * blockDim.y;
      const int firstCol = blockIdx.y * blockDim.y + threadIdx.y;

      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < rows; i += rowStride){
          // Fetch the three row pointers once per row instead of once per element.
          const int* a_row = matrix_a[i];
          const int* b_row = matrix_b[i];
          int* c_row = matrix_c[i];

          for (int j = firstCol; j < cols; j += colStride)
              c_row[j] = a_row[j] + b_row[j];
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement