Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdio.h>
- cudaError_t addWithCuda(int *c, int *a, int *b, size_t size);
// Element-wise addition of two size x size int matrices stored flat in
// row-major order: c[j*size + i] = a[j*size + i] + b[j*size + i].
//
// Expected launch: a 2D grid/block shape whose total x and y thread
// extents are >= size; threads outside the matrix are masked off by the
// bounds check, so over-provisioned grids are safe.
__global__ void addKernel(int *c, const int *a, const int *b, size_t size)
{
    // Global 2D coordinates of this thread (x = column, y = row).
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t j = (size_t)blockIdx.y * blockDim.y + threadIdx.y;

    // Guard: grids rarely divide the data evenly, so extra threads exit.
    if (i < size && j < size)
    {
        size_t index = j * size + i;  // row-major flat index
        c[index] = a[index] + b[index];
    }
}
// Host driver: builds two 5x5 integer matrices, adds them on the GPU via
// addWithCuda, prints each element-wise sum, and resets the device so
// profiling/tracing tools see a complete trace. Returns 0 on success,
// 1 on any CUDA failure.
int main()
{
    const int arraySize = 5;
    int a[arraySize][arraySize];
    int b[arraySize][arraySize];
    int c[arraySize][arraySize] = { 0 };

    // Fill inputs with recognizable values: a[i][j] encodes its own
    // coordinates (10*i + j) and b is 10x that, so c should be 11x a.
    for (int i = 0; i < arraySize; ++i)
    {
        for (int j = 0; j < arraySize; ++j)
        {
            a[i][j] = 10 * i + j;
            b[i][j] = 10 * (10 * i + j);
            c[i][j] = 0;
        }
    }

    // Add the matrices in parallel. Passing the first row (a[0], ...)
    // gives a flat row-major int* view of each 2D array.
    cudaError_t cudaStatus = addWithCuda(c[0], a[0], b[0], arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    for (int i = 0; i < arraySize; ++i)
    {
        for (int j = 0; j < arraySize; ++j)
        {
            printf("%d + %d = %d\n", a[i][j], b[i][j], c[i][j]);
        }
    }

    // cudaDeviceReset must be called before exiting in order for profiling
    // and tracing tools such as Nsight and the Visual Profiler to show
    // complete traces. (Replaces the deprecated cudaThreadExit.)
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}
// Helper function for adding two sizeSingle x sizeSingle matrices on the GPU.
//
// c, a, b are HOST pointers to flat row-major arrays of
// sizeSingle * sizeSingle ints; the element-wise sum a + b is written into c.
// Returns cudaSuccess, or the first CUDA error encountered (device buffers
// are freed on every path via the Error label).
cudaError_t addWithCuda(int *c, int *a, int *b, size_t sizeSingle)
{
    size_t size = sizeSingle * sizeSingle;  // total element count
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Launch configuration: one thread per matrix element, 2x2 threads per
    // block. Each grid dimension is ceil(sizeSingle / blockEdge) — the
    // per-dimension extent, NOT the total element count — with ceil-div so
    // sizes that are not a multiple of the block edge are still fully
    // covered (the kernel masks out-of-range threads).
    // Declared before the first goto so no initialization is jumped over.
    dim3 threadsPerBlock(2, 2);
    dim3 numBlocks(
        (unsigned int)((sizeSingle + threadsPerBlock.x - 1) / threadsPerBlock.x),
        (unsigned int)((sizeSingle + threadsPerBlock.y - 1) / threadsPerBlock.y));

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input matrices from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<numBlocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, sizeSingle);

    // Kernel launches return no error directly; launch-configuration
    // failures (bad grid/block dims, etc.) surface via cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns any
    // errors encountered during execution. (Replaces the deprecated
    // cudaThreadSynchronize.)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output matrix from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Single cleanup point: cudaFree(0) is a harmless no-op, so buffers
    // that were never allocated are safe to free here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement