Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #define SIZE 1024
// Element-wise vector addition kernel: c[i] = a[i] + b[i] for every i in [0, n).
//
// __global__ marks a kernel: it executes on the GPU and is launched from the host.
//
// Launch layout: 1-D grid of 1-D blocks. Any grid/block size produces a complete
// result, because each thread starts at its global index and then advances by the
// total number of launched threads (grid-stride loop). Uses no shared memory.
//
// Parameters:
//   a, b - input arrays; each must hold at least n ints and be readable by the GPU
//   c    - output array; must hold at least n ints and be writable by the GPU
//   n    - number of elements to add; n <= 0 makes the kernel a no-op
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    if (n <= 0)
        return;

    // 64-bit indices: blockIdx.x * blockDim.x and the stride increment can
    // exceed 32 bits for large grids even though n itself fits in an int.
    const size_t count = (size_t)n;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    // Global thread index, not just threadIdx.x: the latter repeats in every
    // block, so a multi-block launch would only ever touch the first
    // blockDim.x elements.
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride)
        c[i] = a[i] + b[i];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two SIZE-element vectors on the GPU and prints the first few sums.
// Returns 0 on success and 1 if any CUDA call fails or a result is wrong.
int main()
{
    const size_t bytes = SIZE * sizeof(int);
    int *a = NULL, *b = NULL, *c = NULL;

    // Managed (unified) memory is accessible from both the host and the GPU.
    // && short-circuits, so nothing further is allocated after a failure.
    bool ok = cudaSucceeded(cudaMallocManaged(&a, bytes), "cudaMallocManaged(a)")
           && cudaSucceeded(cudaMallocManaged(&b, bytes), "cudaMallocManaged(b)")
           && cudaSucceeded(cudaMallocManaged(&c, bytes), "cudaMallocManaged(c)");

    if (ok)
    {
        for (int i = 0; i < SIZE; ++i)
        {
            a[i] = i;
            b[i] = i;
            c[i] = 0;
        }

        // Specifying the launch configuration of the kernel in <<< >>>:
        // <<< number of thread blocks, number of threads per block >>>
        // A single block is limited by the device's max threads per block;
        // an oversized SIZE is reported by cudaGetLastError() below instead
        // of silently skipping the kernel.
        VectorAdd<<<1, SIZE>>>(a, b, c, SIZE);

        // The launch itself returns nothing: configuration errors surface via
        // cudaGetLastError(), execution errors at the next synchronizing call.
        // Synchronizing also makes the CPU wait for the kernel before reading c.
        ok = cudaSucceeded(cudaGetLastError(), "VectorAdd launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "VectorAdd execution");
    }

    if (ok)
    {
        // Check every element against the host-side reference.
        for (int i = 0; i < SIZE; ++i)
        {
            if (c[i] != a[i] + b[i])
            {
                fprintf(stderr, "mismatch at c[%d]: got %d, expected %d\n",
                        i, c[i], a[i] + b[i]);
                ok = false;
                break;
            }
        }

        // Bounded by SIZE so a small SIZE cannot read past the buffer.
        for (int i = 0; i < 10 && i < SIZE; ++i)
            printf("c[%d] = %d\n", i, c[i]);
    }

    // cudaFree(NULL) is a no-op, so this is safe after a partial allocation.
    // Return codes are deliberately ignored: this is best-effort cleanup.
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return ok ? 0 : 1;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement