// gpuMatrixSum

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <vector>

// Utility Functions (host side; definitions are further down in this file)

// Allocates a rows x cols int matrix on the host as a table of row pointers,
// each row being its own malloc'd array of cols ints. Rows are therefore NOT
// contiguous with one another. Elements are left uninitialized.
int** matrixAlloc(int rows, int cols);
// Sets every element ptr[i][j] to i + j. Prints a message to std::cout and
// does nothing else when ptr is null.
void matrixFill(int** ptr, int rows, int cols);
// Writes the matrix to std::cout, one row per line, each value right-aligned
// in a 4-character field. Does nothing when ptr is null.
void matrixPrint(int** ptr, int rows, int cols);

// cuda kernel
// Element-wise sum: matrix_c[i][j] = matrix_a[i][j] + matrix_b[i][j] for every
// i < rows and j < cols, where a thread's x coordinate selects the row i and
// its y coordinate selects the column j.
// Each matrix argument is dereferenced twice inside the kernel (table, then
// row), so both the pointer table and the rows it points to must be
// addressable from device code.
__global__ void gpuMatrixAdd(int** matrix_a, int** matrix_b, int** matrix_c, int rows, int cols);

// Prints the name and description of a failed CUDA call to std::cerr.
static void reportCudaError(const char* what, cudaError_t status){
    std::cerr << what << " failed: " << cudaGetErrorName(status)
              << " (" << cudaGetErrorString(status) << ")\n";
}

// Frees a host matrix made of a row-pointer table plus separately malloc'd rows.
// Accepts a null table and null rows (free(nullptr) is a no-op).
static void hostMatrixFree(int** ptr, int rows){
    if (ptr == nullptr)
        return;
    for (int i = 0; i < rows; i++)
        free(ptr[i]);
    free(ptr);
}

// Allocates a rows x cols matrix on the device in the layout the kernel expects:
// *data is one contiguous buffer of rows * cols ints, *table is a device-resident
// array of `rows` device pointers, entry i pointing at row i inside *data.
// On failure the outputs hold whatever was allocated so far (or nullptr), so the
// caller can always release them with cudaFree.
static cudaError_t deviceMatrixAlloc(int*** table, int** data, int rows, int cols){
    *table = nullptr;
    *data = nullptr;

    cudaError_t status = cudaMalloc((void**)data, sizeof(int) * rows * cols);
    if (status != cudaSuccess)
        return status;

    status = cudaMalloc((void**)table, sizeof(int*) * rows);
    if (status != cudaSuccess)
        return status;

    // The row addresses are device addresses, but they are computed on the host
    // and then uploaded; the host never dereferences them.
    std::vector<int*> rowPointers(rows);
    for (int i = 0; i < rows; i++)
        rowPointers[i] = *data + (size_t)i * cols;

    return cudaMemcpy(*table, rowPointers.data(), sizeof(int*) * rows, cudaMemcpyHostToDevice);
}

// Copies a host row-pointer matrix into a contiguous device buffer.
// Rows are gathered into one staging buffer so a single transfer is issued.
static cudaError_t uploadMatrix(int* devData, int** hostRows, int rows, int cols){
    std::vector<int> staging((size_t)rows * cols);
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            staging[(size_t)i * cols + j] = hostRows[i][j];

    return cudaMemcpy(devData, staging.data(), staging.size() * sizeof(int), cudaMemcpyHostToDevice);
}

// Copies a contiguous device buffer back into a host row-pointer matrix.
// hostRows is only written when the transfer succeeded.
static cudaError_t downloadMatrix(int** hostRows, const int* devData, int rows, int cols){
    std::vector<int> staging((size_t)rows * cols);
    cudaError_t status = cudaMemcpy(staging.data(), devData, staging.size() * sizeof(int), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
        return status;

    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            hostRows[i][j] = staging[(size_t)i * cols + j];

    return cudaSuccess;
}

// Computes host_c = host_a + host_b for n x n matrices on the GPU.
// Returns the first CUDA error encountered, or cudaSuccess. All device memory
// allocated here is released before returning, on every path.
static cudaError_t addOnDevice(int** host_a, int** host_b, int** host_c, int n){
    int*  devData[3]  = { nullptr, nullptr, nullptr };   // a, b, c element storage
    int** devTable[3] = { nullptr, nullptr, nullptr };   // a, b, c row-pointer tables

    cudaError_t status = cudaSuccess;
    for (int m = 0; m < 3 && status == cudaSuccess; m++)
        status = deviceMatrixAlloc(&devTable[m], &devData[m], n, n);

    if (status == cudaSuccess)
        status = uploadMatrix(devData[0], host_a, n, n);
    if (status == cudaSuccess)
        status = uploadMatrix(devData[1], host_b, n, n);

    if (status == cudaSuccess){
        // One block of n x n threads: x indexes rows, y indexes columns.
        int numBlocks = 1;
        dim3 threadsPerBlock(n, n);
        gpuMatrixAdd<<<numBlocks, threadsPerBlock>>>(devTable[0], devTable[1], devTable[2], n, n);
        status = cudaGetLastError();            // launch-configuration errors
    }
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();       // errors raised while the kernel ran
    if (status == cudaSuccess)
        status = downloadMatrix(host_c, devData[2], n, n);

    // cudaFree(nullptr) is a no-op, so this is safe after a partial allocation.
    for (int m = 0; m < 3; m++){
        cudaFree(devTable[m]);
        cudaFree(devData[m]);
    }
    return status;
}

// Adds two N x N integer matrices on the GPU and prints the result.
// Returns 0 on success and -1 if any host allocation or CUDA call failed.
int main(){

    const int N = 8;                // matrices width and height

    // The kernel is launched as a single N x N block.
    static_assert(N * N <= 1024, "N x N must fit in a single thread block");

    cudaError_t cudaError = cudaSetDevice(0);   // selecting first CUDA device
    if (cudaError != cudaSuccess){
        reportCudaError("cudaSetDevice", cudaError);
        return -1;
    }

    int** host_a = matrixAlloc(N, N);           // allocating matrices on host
    int** host_b = matrixAlloc(N, N);
    int** host_c = matrixAlloc(N, N);

    int exitCode = -1;
    if (host_a == nullptr || host_b == nullptr || host_c == nullptr){
        std::cerr << "host matrix allocation failed\n";
    }
    else {
        matrixFill(host_a, N, N);               // filling matrices on host
        matrixFill(host_b, N, N);

        cudaError = addOnDevice(host_a, host_b, host_c, N);
        if (cudaError != cudaSuccess){
            reportCudaError("GPU matrix addition", cudaError);
        }
        else {
            std::cout << "cudaSuccess\n";

            // printing result
            std::cout << "host_c = \n";
            matrixPrint(host_c, N, N);

            std::cin.get();                     // keep the console open until Enter
            exitCode = 0;
        }
    }

    hostMatrixFree(host_a, N);
    hostMatrixFree(host_b, N);
    hostMatrixFree(host_c, N);
    return exitCode;
}


//
// UTILITY FUNCTIONS
//

// Allocates a rows x cols int matrix on the host as a table of `rows` pointers,
// each pointing to its own malloc'd array of `cols` ints (rows are not
// contiguous with one another). Elements are left uninitialized.
//
// Returns nullptr when rows or cols is not positive, or when any allocation
// fails; in the failure case everything allocated so far is released first,
// so the caller never receives a partially built matrix.
// The caller owns the result: free() each row, then the table itself.
int** matrixAlloc(int rows, int cols){
    if (rows <= 0 || cols <= 0)
        return nullptr;

    int** ptr = (int**)malloc(sizeof(int*) * rows);
    if (ptr == nullptr)
        return nullptr;

    for (int i = 0; i < rows; i++){
        ptr[i] = (int*)malloc(sizeof(int) * cols);
        if (ptr[i] == nullptr){
            // Roll back rows [0, i) and the table so nothing leaks.
            while (i-- > 0)
                free(ptr[i]);
            free(ptr);
            return nullptr;
        }
    }

    return ptr;
}

// Fills a host row-pointer matrix so that element (r, c) holds r + c.
// A null table is reported on std::cout and otherwise ignored.
void matrixFill(int** ptr, int rows, int cols){
    if (ptr == nullptr){
        std::cout << "matrixFill called with empty pointer.\n";
        return;
    }

    for (int r = 0; r < rows; ++r){
        for (int c = 0; c < cols; ++c){
            ptr[r][c] = r + c;
        }
    }
}

// Writes a host row-pointer matrix to std::cout: one line per row, every value
// right-aligned in a 4-character field, the stream flushed after each row.
// A null table prints nothing.
void matrixPrint(int** ptr, int rows, int cols){
    if (ptr == nullptr)
        return;

    for (int r = 0; r < rows; ++r){
        for (int c = 0; c < cols; ++c){
            std::cout << std::setw(4) << ptr[r][c];
        }
        std::cout << std::endl;
    }
}

//
// CUDA Kernel
//
// Element-wise matrix sum: matrix_c[i][j] = matrix_a[i][j] + matrix_b[i][j].
//
// Launch layout: a 2D grid of 2D blocks in which the global x coordinate selects
// the row i and the global y coordinate selects the column j. The grid must
// cover at least rows threads in x and cols threads in y; surplus threads are
// masked off by the bounds check. A single block of (rows, cols) threads is the
// simplest valid configuration.
//
// Memory: each matrix is a device-addressable table of `rows` pointers, each
// pointing to a device-addressable array of at least `cols` ints. Passing host
// pointers (table or rows) is invalid. No shared memory is used.
//
// NOTE(review): with x mapped to rows, threads that are adjacent in x touch
// different rows; the mapping is kept so existing (rows, cols) launch
// configurations stay valid.
__global__ void gpuMatrixAdd(int** matrix_a, int** matrix_b, int** matrix_c, int rows, int cols){
    // Global coordinates, so launches with more than one block cover the whole
    // matrix instead of every block recomputing the same tile.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < rows && j < cols)
        matrix_c[i][j] = matrix_a[i][j] + matrix_b[i][j];
}