// CUDA sample: element-wise addition of two 6x5 integer matrices on the GPU.


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Matrix dimensions: arraySizeX columns per row, arraySizeY rows.
const int arraySizeX = 5;
const int arraySizeY = 6;

// Launch configuration: one row's worth of threads per block, and enough
// blocks (rounded up) so that every matrix element gets its own thread.
int block_size = arraySizeX;
int n_blocks = (arraySizeX * arraySizeY + block_size - 1) / block_size;

// Forward declaration of the host-side helper (defined after main) that adds
// the matrices a and b element-wise on the GPU and stores the result in c.
// sizeX is the row width in elements; the returned cudaError_t is cudaSuccess
// when every CUDA call in the helper succeeded.
cudaError_t addWithCuda(int c[arraySizeY][arraySizeX], const int a[arraySizeY][arraySizeX], const int b[arraySizeY][arraySizeX], size_t sizeX);

// Element-wise addition kernel: c[i] = a[i] + b[i] for every element of a
// row-major matrix stored as a flat array.
//
// Parameters:
//   c          - output buffer
//   a, b       - input buffers
//   arraySizeX - row width (number of columns); inside this kernel the
//                parameter shadows the file-scope constant of the same name
//
// The number of rows is taken from the file-scope constant arraySizeY.
//
// Launch layout: each thread handles at most one element. With a 1D launch
// `row` is always 0 and `column` is already the flat element index. With a 2D
// launch the x-extent of the grid is expected to equal arraySizeX so that
// (row, column) maps onto exactly one matrix element.
__global__ void addKernel(int *c, const int *a, const int *b, int arraySizeX)
{
    // threadIdx.x - index of the thread within its block
    // blockIdx.x  - index of the block within the grid
    // blockDim.x  - number of threads per block
    int column = threadIdx.x + blockIdx.x*blockDim.x;
    int row = threadIdx.y + blockIdx.y*blockDim.y;

    // Row-major flattening: consecutive rows are arraySizeX (the row width)
    // elements apart, not arraySizeY (the row count).
    int i = row*arraySizeX + column;

    // Guard the tail: the grid may contain more threads than matrix elements.
    if (i < arraySizeX*arraySizeY)
        c[i] = a[i] + b[i];
}


// Program entry point: builds two 6x5 host matrices, adds them on the GPU via
// addWithCuda, prints every row of the result, and resets the device.
// Returns 0 on success and 1 if the addition or the device reset fails.
int main()
{

    const int a[arraySizeY][arraySizeX] = {{ 1, 2, 3, 4, 5 },
                                        { 21, 22, 23, 24, 25 },
                                        { 31, 32, 33, 34, 35 },
                                        { 41, 42, 43, 44, 45 },
                                        { 51, 52, 53, 54, 55 },
                                        { 61, 62, 63, 64, 65 }};
    const int b[arraySizeY][arraySizeX] = {{ 10, 20, 30, 40, 50 },
                                        { 210, 220, 230, 240, 250 },
                                        { 310, 320, 330, 340, 350 },
                                        { 410, 420, 430, 440, 450 },
                                        { 510, 520, 530, 540, 550 },
                                        { 610, 620, 630, 640, 650 }};
    int c[arraySizeY][arraySizeX] = { 0 };

    // Add the matrices in parallel on the GPU.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySizeX);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // One line per matrix row; the operands are spelled out in the format
    // string and only the computed sums are substituted.
    printf("{ 1, 2, 3, 4, 5} + { 10, 20, 30, 40, 50} = { %d, %d, %d, %d, %d}\n",
        c[0][0], c[0][1], c[0][2], c[0][3], c[0][4]);
    printf("{21,22,23,24,25} + {210,220,230,240,250} = {%d,%d,%d,%d,%d}\n",
        c[1][0], c[1][1], c[1][2], c[1][3], c[1][4]);
    printf("{31,32,33,34,35} + {310,320,330,340,350} = {%d,%d,%d,%d,%d}\n",
        c[2][0], c[2][1], c[2][2], c[2][3], c[2][4]);
    printf("{41,42,43,44,45} + {410,420,430,440,450} = {%d,%d,%d,%d,%d}\n",
        c[3][0], c[3][1], c[3][2], c[3][3], c[3][4]);
    printf("{51,52,53,54,55} + {510,520,530,540,550} = {%d,%d,%d,%d,%d}\n",
        c[4][0], c[4][1], c[4][2], c[4][3], c[4][4]);
    printf("{61,62,63,64,65} + {610,620,630,640,650} = {%d,%d,%d,%d,%d}\n",
        c[5][0], c[5][1], c[5][2], c[5][3], c[5][4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Parallel Nsight and Visual Profiler to show complete
    // traces. It replaces the deprecated cudaThreadExit.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add two matrices element-wise in parallel.
//
// Parameters:
//   c     - host output matrix, receives a + b
//   a, b  - host input matrices
//   sizeX - row width in elements; the row count is the file-scope constant
//           arraySizeY
//
// Returns cudaSuccess when every step succeeded, otherwise the error code of
// the first CUDA call that failed. Device buffers are released on every path.
//
// The launch configuration comes from the file-scope n_blocks / block_size,
// which are derived from arraySizeX, so sizeX is expected to equal arraySizeX.
cudaError_t addWithCuda(int c[arraySizeY][arraySizeX], const int a[arraySizeY][arraySizeX], const int b[arraySizeY][arraySizeX], size_t sizeX)
{
    // All locals are declared before the first goto: C++ does not allow a
    // jump to bypass the initialization of a variable that is still in scope.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    const size_t bytes = sizeX * arraySizeY * sizeof(int);
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel with one thread per element; the number of blocks
    // depends on the size of the vector/matrix. The kernel takes the row
    // width as int, so the narrowing from size_t is made explicit.
    addKernel<<<n_blocks, block_size>>>(dev_c, dev_a, dev_b, static_cast<int>(sizeX));

    // A kernel launch returns no status itself; launch-time failures such as
    // an invalid configuration are only reported through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // errors encountered while it was running. It replaces the deprecated
    // cudaThreadSynchronize.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on success as well as on failure; pointers that were never
    // allocated are still 0 here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}