// Untitled

#include <cuda_runtime.h>
#include <iostream>
#include <device_launch_parameters.h>
#include "cuda_error_detector.h"
#include <vector>


// Helpers that spell out the kernel execution configuration.
// When __CUDACC__ is defined each macro expands to the <<< ... >>> launch
// syntax with the given arguments; otherwise each macro expands to nothing
// (presumably so that non-CUDA tooling can still parse launch statements).
#if defined(__CUDACC__)
#define KERNEL_ARGS2(grid_dim, block_dim) <<< grid_dim, block_dim >>>
#define KERNEL_ARGS3(grid_dim, block_dim, shared_bytes) <<< grid_dim, block_dim, shared_bytes >>>
#define KERNEL_ARGS4(grid_dim, block_dim, shared_bytes, cuda_stream) <<< grid_dim, block_dim, shared_bytes, cuda_stream >>>
#else
#define KERNEL_ARGS2(grid_dim, block_dim)
#define KERNEL_ARGS3(grid_dim, block_dim, shared_bytes)
#define KERNEL_ARGS4(grid_dim, block_dim, shared_bytes, cuda_stream)
#endif

// Element-wise addition of two int arrays on the GPU, where every thread
// loops over several elements because there are fewer threads than elements.
namespace add_loop_blocks
{
	// Launch configuration used by main_add_loop_blocks().
	// NOTE(review): 2 threads per block uses only a small part of each 32-lane
	// warp; presumably kept tiny for demonstration purposes - confirm before tuning.
	const int number_of_blocks = 100;
	const int threads_per_block = 2;
	// Total threads in the launch made by main_add_loop_blocks().
	const int number_of_threads = number_of_blocks * threads_per_block;
	// Number of elements in each of the arrays a, b and c.
	const int number_of_data = 654321;

	// Returns the flat index of the calling thread within the whole grid.
	// Blocks are linearised x-fastest, then y, then z; threads inside a block
	// are linearised the same way, so this works for 1D, 2D and 3D launches.
	__device__ int get_id()
	{
		int block_id = blockIdx.z * (gridDim.x * gridDim.y)
			+ blockIdx.y * (gridDim.x)
			+ blockIdx.x;
		int threadId = block_id * (blockDim.x * blockDim.y * blockDim.z)
			+ (threadIdx.z * (blockDim.x * blockDim.y))
			+ (threadIdx.y * blockDim.x)
			+ threadIdx.x;
		return threadId;
	}

	// Returns the total number of threads in the current launch
	// (blocks in the grid multiplied by threads per block).
	__device__ int get_thread_count()
	{
		return (gridDim.x * gridDim.y * gridDim.z)
			* (blockDim.x * blockDim.y * blockDim.z);
	}

	// Kernel: c[i] = a[i] + b[i] for every i in [0, number_of_data).
	//
	// a, b : input arrays holding at least number_of_data ints.
	// c    : output array holding at least number_of_data ints.
	//
	// Uses a grid-stride loop: each thread starts at its flat grid index and
	// advances by the total number of launched threads. The stride is taken
	// from the actual launch dimensions rather than a compile-time constant,
	// so every element is processed exactly once for any grid/block shape.
	__global__ void add(const int* a, const int* b, int *c)
	{
		const int stride = get_thread_count();
		for (int id = get_id(); id < number_of_data; id += stride)
		{
			c[id] = a[id] + b[id];
		}
	}

	// Allocates three managed arrays, fills a and b on the host, adds them on
	// the device, prints the last element of each array and frees the memory.
	// Returns 0.
	int main_add_loop_blocks(void)
	{
		const auto sizes = number_of_data * sizeof(int);
		int* a = nullptr;
		int* b = nullptr;
		int* c = nullptr;

		// Allocate managed memory, which is accessed by both CPU and GPU.
		{
			CudaSafeCall(cudaMallocManaged(&a, sizes));
			CudaSafeCall(cudaMallocManaged(&b, sizes));
			CudaSafeCall(cudaMallocManaged(&c, sizes));
		}

		// Assign initial values on the CPU through the managed pointers.
		for (int i = 0; i < number_of_data; ++i)
		{
			a[i] = i;
			b[i] = i + 1;
		}

		// Add on the GPU through the same managed pointers.
		add_loop_blocks::add KERNEL_ARGS2(number_of_blocks, threads_per_block)(a, b, c);

		// A kernel launch returns no status itself; launch-configuration
		// errors have to be fetched explicitly.
		CudaSafeCall(cudaGetLastError());

		// Wait for the kernel to finish before the CPU reads the results;
		// this also reports errors raised while the kernel was running.
		CudaSafeCall(cudaDeviceSynchronize());

// Uncomment the loop below to print every element.
//		for (int i = 0; i < number_of_data; ++i)
//		{
//			std::cout << i << " is " << a[i] << "   " << b[i] << "   " << c[i] << "\n";
//		}

		// Show only the last item.
		const auto i = number_of_data - 1;
		std::cout << i << " is " << a[i] << "   " << b[i] << "   " << c[i] << "\n";

		// Release the managed allocations.
		{
			CudaSafeCall(cudaFree(a));
			CudaSafeCall(cudaFree(b));
			CudaSafeCall(cudaFree(c));
		}

		return 0;
	}
}


// Program entry point: runs the add_loop_blocks demo.
// The demo's return value is discarded; this function always returns 0.
int main()
{
	static_cast<void>(add_loop_blocks::main_add_loop_blocks());
	return 0;
}