Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <cuda_runtime.h>
- #include <iostream>
- #include <device_launch_parameters.h>
- #include "cuda_error_detector.h"
- #include <vector>
// Kernel launch-configuration helpers.
//
// When the CUDA compiler is driving the build (__CUDACC__ is defined) each
// macro expands to the corresponding <<< ... >>> execution configuration.
// Otherwise each macro expands to nothing -- presumably so that tools which do
// not understand the <<< >>> syntax (IDE parsers, plain C++ compilers) can
// still read the file.
//
// Usage: kernel KERNEL_ARGS2(blocks, threads)(args...);
#if defined(__CUDACC__)
#define KERNEL_ARGS2(grid_dim, block_dim) <<< grid_dim, block_dim >>>
#define KERNEL_ARGS3(grid_dim, block_dim, shared_bytes) <<< grid_dim, block_dim, shared_bytes >>>
#define KERNEL_ARGS4(grid_dim, block_dim, shared_bytes, stream_id) <<< grid_dim, block_dim, shared_bytes, stream_id >>>
#else  // not compiled as CUDA: drop the execution configuration entirely
#define KERNEL_ARGS2(grid_dim, block_dim)
#define KERNEL_ARGS3(grid_dim, block_dim, shared_bytes)
#define KERNEL_ARGS4(grid_dim, block_dim, shared_bytes, stream_id)
#endif
// Element-wise addition of two integer arrays on the GPU, where every thread
// loops over the data so that fewer threads than elements are sufficient.
namespace add_loop_blocks
{
    // Launch configuration used by main_add_loop_blocks().
    // NOTE(review): 2 threads per block is well below the 32-lane warp size; a
    // multiple of 32 would presumably use the hardware better -- confirm
    // whether the small value is intentional before changing it.
    const int number_of_blocks = 100;
    const int threads_per_block = 2;

    // Total number of threads in that launch. Kept for any code that still
    // refers to it; the kernel itself derives its stride from the actual
    // launch dimensions instead of relying on this constant.
    const int number_of_threads = number_of_blocks * threads_per_block;

    // Number of elements in each of the arrays a, b and c.
    const int number_of_data = 654321;

    // Returns the flat, zero-based index of the calling thread within the
    // whole launch, for grids and blocks of up to three dimensions.
    __device__ int get_id()
    {
        // Linear index of this block inside the grid (x fastest, z slowest).
        const int block_id = blockIdx.z * (gridDim.x * gridDim.y)
                           + blockIdx.y * (gridDim.x)
                           + blockIdx.x;

        // Offset of the block's first thread plus this thread's linear index
        // inside its block (again x fastest, z slowest).
        const int thread_id = block_id * (blockDim.x * blockDim.y * blockDim.z)
                            + (threadIdx.z * (blockDim.x * blockDim.y))
                            + (threadIdx.y * blockDim.x)
                            + threadIdx.x;
        return thread_id;
    }

    // Returns the total number of threads in the current launch, i.e. the
    // product of all grid and block dimensions.
    __device__ int get_thread_count()
    {
        return gridDim.x * gridDim.y * gridDim.z
             * (blockDim.x * blockDim.y * blockDim.z);
    }

    // Computes c[i] = a[i] + b[i] for every i in [0, number_of_data).
    //
    // Each thread starts at its own flat index and advances by the total
    // number of launched threads, so all elements are covered exactly once
    // for any launch configuration whose total thread count fits in an int.
    //
    // a, b: input arrays with at least number_of_data elements each.
    // c:    output array with at least number_of_data elements.
    __global__ void add(const int* a, const int* b, int *c)
    {
        // Derive the stride from the real launch dimensions rather than from
        // number_of_threads, so the kernel stays correct if it is ever
        // launched with a different grid/block size.
        const int stride = get_thread_count();
        for (int id = get_id(); id < number_of_data; id += stride)
        {
            c[id] = a[id] + b[id];
        }
    }

    // Runs the demo: allocates three managed arrays, fills the inputs on the
    // host, adds them on the device, prints the last element and frees the
    // arrays. Always returns 0; CUDA failures are reported through
    // CudaSafeCall.
    int main_add_loop_blocks(void)
    {
        const auto sizes = number_of_data * sizeof(int);
        int* a = nullptr;
        int* b = nullptr;
        int* c = nullptr;

        // Allocate managed memory, which is used by both the CPU and the GPU.
        {
            CudaSafeCall(cudaMallocManaged(&a, sizes));
            CudaSafeCall(cudaMallocManaged(&b, sizes));
            CudaSafeCall(cudaMallocManaged(&c, sizes));
        }

        // Assign initial values. The CPU writes directly to managed memory.
        for (int i = 0; i < number_of_data; ++i)
        {
            a[i] = i;
            b[i] = i + 1;
        }

        // Add. The GPU reads and writes the same managed memory.
        add_loop_blocks::add KERNEL_ARGS2(number_of_blocks, threads_per_block)(a, b, c);
        // A kernel launch returns no status itself; fetch launch-time errors
        // (e.g. an invalid configuration) explicitly.
        CudaSafeCall(cudaGetLastError());

        // Wait for the kernel to finish before the host touches the results.
        CudaSafeCall(cudaDeviceSynchronize());

        // To show every element, uncomment the following loop.
        // for (int i = 0; i < number_of_data; ++i)
        // {
        //     std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";
        // }

        // Show only the last item.
        const auto i = number_of_data - 1;
        std::cout << i << " is " << a[i] << " " << b[i] << " " << c[i] << "\n";

        // Free the managed allocations.
        {
            CudaSafeCall(cudaFree(a));
            CudaSafeCall(cudaFree(b));
            CudaSafeCall(cudaFree(c));
        }
        return 0;
    }
}
// Program entry point: runs the add_loop_blocks demo and uses the status it
// returns as the process exit code instead of discarding it.
int main()
{
    return add_loop_blocks::main_add_loop_blocks();
}
Add Comment
Please, Sign In to add comment