// Untitled: CUDA vector-addition example using managed (unified) memory.

#include <stdio.h>
#define SIZE    1024

// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
//
// __global__ marks this function as a kernel: it runs on the GPU and is
// launched from the host with the <<<blocks, threads>>> syntax.
//
// Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
// global index and advances by the total number of launched threads
// (grid-stride loop), so the result is correct whether the launch has fewer,
// equally many, or more threads than elements.
//
// a, b : input vectors, at least n elements each, readable from the device
// c    : output vector, at least n elements, writable from the device
// n    : number of elements to process; n <= 0 does nothing
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    // Total number of threads in the grid. Computed in 64 bits so that neither
    // the product nor the running index can wrap around for large launches.
    const long long stride = (long long)gridDim.x * blockDim.x;

    // The global index combines block and thread position; threadIdx.x alone
    // only identifies a thread inside its own block.
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride)
    {
        c[i] = a[i] + b[i];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two SIZE-element vectors on the GPU and prints the first results.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    // Start from NULL so the cleanup at the end is safe on every path:
    // cudaFree on a null pointer performs no operation.
    int *a = NULL, *b = NULL, *c = NULL;
    int exitCode = 1;
    const size_t bytes = SIZE * sizeof(int);

    // Managed allocations are accessible from both the host and the GPU.
    if (cudaSucceeded(cudaMallocManaged(&a, bytes), "cudaMallocManaged(a)") &&
        cudaSucceeded(cudaMallocManaged(&b, bytes), "cudaMallocManaged(b)") &&
        cudaSucceeded(cudaMallocManaged(&c, bytes), "cudaMallocManaged(c)"))
    {
        for (int i = 0; i < SIZE; ++i)
        {
            a[i] = i;
            b[i] = i;
            c[i] = 0;
        }

        // Launch configuration of the kernel goes in <<< >>>:
        // <<< number of thread blocks, number of threads per block >>>
        VectorAdd <<<1, SIZE>>>(a, b, c, SIZE);

        // A kernel launch returns nothing: an invalid configuration is only
        // visible through cudaGetLastError(), and a fault during execution
        // only through the next synchronizing call. The synchronize also
        // makes the CPU wait for the kernel before it reads c.
        if (cudaSucceeded(cudaGetLastError(), "VectorAdd launch") &&
            cudaSucceeded(cudaDeviceSynchronize(), "VectorAdd execution"))
        {
            // Print at most the first 10 results, never past the end of c.
            for (int i = 0; i < 10 && i < SIZE; ++i)
                printf("c[%d] = %d\n", i, c[i]);

            exitCode = 0;
        }
    }

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    return exitCode;
}