Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdio.h>
// Compile-time matrix shape shared by every array in this file:
// each matrix has arraySizeY rows of arraySizeX ints.
const int arraySizeX = 5;
const int arraySizeY = 6;

// Launch configuration used for addKernel: one block holds block_size
// threads (one matrix row's worth of elements).
int block_size = arraySizeX;

// Number of blocks needed to cover all arraySizeX * arraySizeY elements.
// Ceiling division, so a partially filled trailing block is still counted.
int n_blocks = (arraySizeX * arraySizeY + block_size - 1) / block_size;

// Host-side helper that runs the addition on the GPU; defined after main().
cudaError_t addWithCuda(int c[arraySizeY][arraySizeX], const int a[arraySizeY][arraySizeX], const int b[arraySizeY][arraySizeX], size_t sizeX);
// Element-wise addition kernel: c[i] = a[i] + b[i] over a row-major matrix
// stored as a flat array of ints.
//
// Parameters:
//   c          - output buffer, at least arraySizeX * arraySizeY ints
//   a, b       - input buffers of the same size as c
//   arraySizeX - row width (number of columns); this parameter shadows the
//                file-level constant of the same name
//
// The number of rows is taken from the file-level constant arraySizeY.
//
// Launch layout: each thread handles the single element at (row, column)
// derived from its x/y thread and block indices. A 1D launch (row == 0 for
// every thread, column running over the whole flat array) and a 2D launch
// both work; threads whose flat index falls outside the matrix do nothing.
__global__ void addKernel(int *c, const int *a, const int *b, int arraySizeX)
{
    // threadIdx.x - thread index within the block
    // blockIdx.x  - block index within the grid
    // blockDim.x  - number of threads per block
    const int column = threadIdx.x + blockIdx.x * blockDim.x;
    const int row    = threadIdx.y + blockIdx.y * blockDim.y;

    // Row-major layout: consecutive rows are arraySizeX (the row width)
    // elements apart. Using the height as the stride would skip elements as
    // soon as the kernel is launched with more than one row of threads.
    const int i = row * arraySizeX + column;

    // Guard the grid tail: the launch may contain more threads than elements.
    if (i < arraySizeX * arraySizeY)
        c[i] = a[i] + b[i];
}
// Program entry point.
//
// Adds two hard-coded arraySizeY x arraySizeX integer matrices on the GPU via
// addWithCuda and prints the result row by row.
//
// Returns 0 on success, 1 if the GPU addition or the final device reset fails.
int main()
{
    const int a[arraySizeY][arraySizeX] = {{  1,  2,  3,  4,  5 },
                                           { 21, 22, 23, 24, 25 },
                                           { 31, 32, 33, 34, 35 },
                                           { 41, 42, 43, 44, 45 },
                                           { 51, 52, 53, 54, 55 },
                                           { 61, 62, 63, 64, 65 }};
    const int b[arraySizeY][arraySizeX] = {{  10,  20,  30,  40,  50 },
                                           { 210, 220, 230, 240, 250 },
                                           { 310, 320, 330, 340, 350 },
                                           { 410, 420, 430, 440, 450 },
                                           { 510, 520, 530, 540, 550 },
                                           { 610, 620, 630, 640, 650 }};
    int c[arraySizeY][arraySizeX] = { 0 };

    // Add the matrices in parallel on the GPU.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySizeX);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{ 1, 2, 3, 4, 5} + { 10, 20, 30, 40, 50} = { %d, %d, %d, %d, %d}\n",
           c[0][0], c[0][1], c[0][2], c[0][3], c[0][4]);
    printf("{21,22,23,24,25} + {210,220,230,240,250} = {%d,%d,%d,%d,%d}\n",
           c[1][0], c[1][1], c[1][2], c[1][3], c[1][4]);
    printf("{31,32,33,34,35} + {310,320,330,340,350} = {%d,%d,%d,%d,%d}\n",
           c[2][0], c[2][1], c[2][2], c[2][3], c[2][4]);
    printf("{41,42,43,44,45} + {410,420,430,440,450} = {%d,%d,%d,%d,%d}\n",
           c[3][0], c[3][1], c[3][2], c[3][3], c[3][4]);
    printf("{51,52,53,54,55} + {510,520,530,540,550} = {%d,%d,%d,%d,%d}\n",
           c[4][0], c[4][1], c[4][2], c[4][3], c[4][4]);
    printf("{61,62,63,64,65} + {610,620,630,640,650} = {%d,%d,%d,%d,%d}\n",
           c[5][0], c[5][1], c[5][2], c[5][3], c[5][4]);

    // Reset the device before exiting so that profiling and tracing tools can
    // show complete traces. cudaDeviceReset is the supported replacement for
    // the deprecated cudaThreadExit.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}
// Helper function that adds two host matrices on the GPU.
//
// Parameters:
//   c     - host output matrix; receives a + b
//   a, b  - host input matrices
//   sizeX - number of columns to process per row; must not exceed arraySizeX,
//           because the host arrays hold exactly arraySizeX ints per row
//
// The matrices are treated as flat arrays of sizeX * arraySizeY ints: they are
// copied to the device, added by addKernel with one thread per element, and
// the result is copied back into c.
//
// Returns cudaSuccess on success, cudaErrorInvalidValue if sizeX is larger
// than arraySizeX, or the error code of the first CUDA call that failed.
// Device buffers are released on every path that allocated them.
cudaError_t addWithCuda(int c[arraySizeY][arraySizeX], const int a[arraySizeY][arraySizeX], const int b[arraySizeY][arraySizeX], size_t sizeX)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus = cudaSuccess;

    // Copying more than arraySizeX ints per row would run past the end of the
    // host arrays, so reject such requests up front.
    if (sizeX > static_cast<size_t>(arraySizeX)) {
        fprintf(stderr, "addWithCuda: sizeX exceeds arraySizeX!\n");
        return cudaErrorInvalidValue;
    }

    const size_t count = sizeX * arraySizeY;   // total number of elements
    const size_t bytes = count * sizeof(int);  // size of each buffer in bytes

    // Nothing to compute; also avoids launching a grid with zero blocks.
    if (count == 0) {
        return cudaSuccess;
    }

    // Derive the launch configuration from the number of elements actually
    // processed. Ceiling division keeps a partially filled last block.
    const unsigned int threads = static_cast<unsigned int>(block_size);
    const unsigned int blocks  = static_cast<unsigned int>((count + threads - 1) / threads);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel with one thread per element; the number of blocks
    // depends on the size of the vector/matrix.
    addKernel<<<blocks, threads>>>(dev_c, dev_a, dev_b, static_cast<int>(sizeX));

    // A kernel launch returns no status itself; configuration errors are
    // reported through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // error encountered while it was running. It replaces the deprecated
    // cudaThreadSynchronize.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup for success and failure; buffers that were never
    // allocated are still null pointers here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement