// CUDA

// Dawid Mocek (section 15)
// Alina Litwiak (section 17, probably)
// Łukasz Sałajczyk (section 15)
//
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Compile-time side length of the square matrices used throughout this file;
// it fixes the extents of every 2D array parameter below.
const int arraySize = 2;

// Host-side helper (defined below) that adds matrices a and b element-wise on
// the GPU and stores the result in c. Returns the status of the CUDA calls.
cudaError_t addWithCuda(int c[arraySize][arraySize], const int a[arraySize][arraySize], const int b[arraySize][arraySize], size_t size);

// Element-wise matrix addition kernel: c[row][col] = a[row][col] + b[row][col].
//
// Launch layout: a 2D grid of 2D blocks. The x dimension indexes columns and
// the y dimension indexes rows, so neighbouring threads along x touch
// neighbouring addresses within a row. The grid must cover at least
// size threads in both x and y; surplus threads are masked by the guard.
//
// Parameters:
//   c    - output matrix (device memory)
//   a, b - input matrices (device memory)
//   size - number of rows/columns to process; rows are addressed with an
//          arraySize stride, so size must not exceed arraySize
__global__ void addKernel(int c[arraySize][arraySize], const int a[arraySize][arraySize], const int b[arraySize][arraySize], const int size)
{
    // Global coordinates of the element handled by this thread.
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so skip threads past the edge.
    if (row < size && col < size) {
        c[row][col] = a[row][col] + b[row][col];
    }
}

// Helper function that uses CUDA to add two square matrices in parallel.
//
// Allocates three device buffers, copies a and b to the device, launches
// addKernel over a 2D grid of 5x5-thread blocks, waits for completion and
// copies the result back into c. All device buffers are released before
// returning, on both the success and the failure path.
//
// Parameters:
//   c    - host output matrix, receives a + b
//   a, b - host input matrices
//   size - side length of the matrices; must equal arraySize because the
//          array parameters and the kernel both use a fixed arraySize row
//          stride (any other value would read or write out of bounds)
//
// Returns cudaSuccess on success, cudaErrorInvalidValue when size does not
// match arraySize, or the error code of the first CUDA call that failed.
cudaError_t addWithCuda(int c[arraySize][arraySize], const int a[arraySize][arraySize], const int b[arraySize][arraySize], size_t size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Buffer sizes below are derived from size while indexing uses arraySize;
    // refuse to run if the two disagree.
    if (size != (size_t)arraySize) {
        fprintf(stderr, "addWithCuda: size must equal arraySize!\n");
        return cudaErrorInvalidValue;
    }

    // Everything with an initializer is declared here, ahead of the first
    // "goto Error", so that no jump crosses an initialization.
    const size_t bytes = size * size * sizeof(int);

    // Block size: 5x5 threads.
    const unsigned int threadsPerSide = 5;

    // Number of blocks per grid dimension, rounded up so the whole matrix is covered.
    const unsigned int blocks = (unsigned int)((size + threadsPerSide - 1) / threadsPerSide);

    dim3 blockSize(threadsPerSide, threadsPerSide);
    dim3 numberOfBlocks(blocks, blocks);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel; the flat device buffers are viewed as rows of arraySize ints.
    addKernel<<<numberOfBlocks, blockSize>>>((int(*)[arraySize])dev_c, (int(*)[arraySize])dev_a, (int(*)[arraySize])dev_b, (int)size);

    // A kernel launch returns nothing itself; launch-configuration errors
    // have to be fetched explicitly.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during its execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on success as well; cudaFree on a null pointer is a no-op.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

// Prints one arraySize x arraySize matrix to stdout: elements separated by a
// space, one row per line.
static void printMatrix(const int m[arraySize][arraySize])
{
    for (int row = 0; row < arraySize; row++) {
        for (int col = 0; col < arraySize; col++) {
            printf("%d ", m[row][col]);
        }
        printf("\n");
    }
}

// Program entry point: adds two fixed sample matrices on the GPU, prints the
// inputs and the result, resets the device and waits for a key press.
// Returns 0 on success and 1 if the addition or the device reset fails.
int main()
{
    // Input matrix A
    int a[arraySize][arraySize] = {{4,5},{6,7}};

    // Input matrix B
    int b[arraySize][arraySize] = {{9,9999999},{1,2}};

    // Result matrix, zero-initialized
    int c[arraySize][arraySize] = { 0 };

    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // Display the matrices
    printf("Matrix A:\n");
    printMatrix(a);

    printf("Matrix B:\n");
    printMatrix(b);

    printf("Rezults:\n");
    printMatrix(c);

    // cudaDeviceReset (the replacement for the deprecated cudaThreadExit) must
    // be called before exiting in order for profiling and tracing tools such as
    // Parallel Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    // Keep the console window open until a key is pressed.
    getchar();
    return 0;
}