Untitled

# (Approximate command-line.  Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)

# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Windows Kits\8.1\"
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\bin\nvcc.exe" --use-local-env --cl-version 2013 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin"  -I"D:\Software\Dropbox\Skola\Master's Courses\DH2323 Computer Graphics and Interaction\Project\CUDA_udacity\CUDA_udacity\include"   -G   --keep-dir Debug -maxrregcount=0  --machine 32 --compile -cudart static  -o Debug\%(Filename)%(Extension).obj "%(FullPath)"

# Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Windows Kits\8.1\"
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\bin\nvcc.exe" --use-local-env --cl-version 2013 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin"  -I"D:\Software\Dropbox\Skola\Master's Courses\DH2323 Computer Graphics and Interaction\Project\CUDA_udacity\CUDA_udacity\include"   -G   --keep-dir Debug -maxrregcount=0  --machine 32 --compile -cudart static  -g    -Xcompiler "/EHsc  /nologo /Od /Zi   " -o Debug\%(Filename)%(Extension).obj "%(FullPath)"


//source code below
//test.cu
#include "test.h"

/*
 * Element-wise addition kernel: d_out[i] = d_a[i] + d_b[i].
 *
 * Each thread processes exactly one element, chosen by its global 1D index
 * (blockIdx.x * blockDim.x + threadIdx.x). Only the x dimension of the grid
 * and block is used, so a single-block launch behaves exactly as indexing
 * by threadIdx.x alone, while multi-block launches now cover distinct
 * elements instead of every block rewriting the same ones.
 *
 * Preconditions:
 *   - d_out, d_a and d_b are device pointers.
 *   - The kernel takes no length argument and performs no bounds check, so
 *     every array must hold at least gridDim.x * blockDim.x elements.
 */
template <typename T>
__global__ void add(T* d_out, const T* d_a, const T* d_b)
{
	// Widen before multiplying so the product cannot wrap in 32-bit arithmetic.
	const size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

	d_out[i] = d_a[i] + d_b[i];
}

// The template body is visible only in this translation unit (test.h carries
// just the declaration), so other .cu files cannot instantiate it implicitly.
// main.cu launches the kernel with float pointers; instantiate that
// specialization explicitly here so the linker can resolve it. Add one such
// line for every further element type that is launched from another file.
template __global__ void add<float>(float* d_out, const float* d_a, const float* d_b);

//test.h
#include <stdio.h>
#include <iostream>
#include <time.h>

#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Declaration of the element-wise addition kernel: for the element index
// handled by each thread, d_out[i] = d_a[i] + d_b[i].
//
// d_out  - destination array (written by the kernel)
// d_a    - first operand array (read-only)
// d_b    - second operand array (read-only)
//
// NOTE: only the declaration lives in this header; the template body is in
// test.cu. A file that merely includes this header therefore cannot
// instantiate the kernel implicitly - every element type T that is launched
// from another translation unit needs an explicit instantiation in test.cu,
// otherwise the link step fails with an unresolved symbol.
template <typename T>
__global__ void add(T* d_out, const T* d_a, const T* d_b);

//main.cu
#include <stdio.h>
#include <iostream>
#include <time.h>

#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "test.h"

using namespace std;

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
	if (status == cudaSuccess)
	{
		return true;
	}

	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
	return false;
}

/*
 * Fills two host arrays, adds them element-wise on the GPU with the `add`
 * kernel and copies the result back into h_out.
 *
 * Every CUDA runtime call is checked; the first failure is reported on
 * stderr and the remaining GPU steps are skipped. Device buffers are always
 * released before exit.
 *
 * Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
 */
int main(int argc, char **argv)
{
	const int ARRAY_SIZE = 1024;
	const int ARRAY_BYTES = sizeof(float) * ARRAY_SIZE;

	float h_a[ARRAY_SIZE];
	float h_b[ARRAY_SIZE];

	for (int i = 0; i < ARRAY_SIZE; i++)
	{
		h_a[i] = i / 2.0f;
		h_b[i] = (i + 1) / 3.0f;
	}

	float h_out[ARRAY_SIZE];

	// Start from NULL so the cleanup below is safe even when an allocation
	// fails part-way through (cudaFree on a null pointer does nothing).
	float* d_a = NULL;
	float* d_b = NULL;
	float* d_out = NULL;

	//clock_t t;

	//t = clock();

	// && short-circuits, so the first failing step stops the sequence.
	bool ok = cudaOk(cudaMalloc(&d_a, ARRAY_BYTES), "cudaMalloc(d_a)")
		&& cudaOk(cudaMalloc(&d_b, ARRAY_BYTES), "cudaMalloc(d_b)")
		&& cudaOk(cudaMalloc(&d_out, ARRAY_BYTES), "cudaMalloc(d_out)")
		&& cudaOk(cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_a <- h_a)")
		&& cudaOk(cudaMemcpy(d_b, h_b, ARRAY_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_b <- h_b)");

	if (ok)
	{
		// One block of ARRAY_SIZE threads: one thread per element.
		add <<<1, ARRAY_SIZE>>>(d_out, d_a, d_b);

		// A kernel launch returns nothing; an invalid launch configuration
		// is only visible through cudaGetLastError(). The blocking copy
		// that follows waits for the kernel and reports execution errors.
		ok = cudaOk(cudaGetLastError(), "add kernel launch")
			&& cudaOk(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out <- d_out)");
	}

	//cout << clock() - t << endl;

	// Release device memory on both the success and the failure path.
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_out);

	// Wait for a key press before exiting.
	std::cin.get();
	return ok ? 0 : 1;
}