// Untitled: SAXPY timing comparison (hand-written CUDA kernel vs. cuBLAS vs. Thrust).

#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
/*
 * Number of float elements in every benchmark vector.
 *
 * The shift count must stay below the width of int: shifting an int by 50
 * bits is undefined behaviour. 10 << 20 is 10,485,760 elements (40 MiB per
 * array) and is a multiple of 256.
 */
#define N (10 << 20)


/*
 * SAXPY kernel: y[i] = alpha * x[i] + y[i], one element per thread.
 *
 * Parameters:
 *   alpha - scalar multiplier applied to x.
 *   x     - input vector (only read).
 *   y     - input/output vector, updated in place.
 *
 * Launch layout: 1D grid of 1D blocks. The kernel performs NO bounds check,
 * so gridDim.x * blockDim.x must not exceed the number of elements in x and
 * y; every launched thread reads and writes its element unconditionally.
 */
__global__ void gSaxpy(float alpha, const float *x, float *y) {
	// Widen before multiplying so the flat index cannot wrap in 32-bit
	// arithmetic (or turn negative when narrowed to int) on very large grids.
	const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
	y[i] = alpha * x[i] + y[i];
}


/*
 * Binary function object computing a * x + y for a fixed scalar a.
 *
 * The call operator is const-qualified so the functor can be invoked through
 * const objects and const references; both the constructor and the call
 * operator are usable from host and device code.
 */
struct saxpy_functor {
	const float a;  // scalar multiplier, fixed at construction
	__host__ __device__ saxpy_functor(float _a) : a(_a) {}
	// Returns a * x + y.
	__host__ __device__ float operator()(float x, float y) const {
		return a * x + y;
	}
};

/*
 * In-place SAXPY on Thrust device vectors: y <- a * x + y.
 *
 * The transform walks the full range of x and reads/writes y starting at
 * y.begin(), so y must hold at least x.size() elements.
 */
void saxpy(float a, thrust::device_vector<float>& x, thrust::device_vector<float>& y) {
	thrust::transform(
		x.begin(), x.end(),   // first operand range
		y.begin(),            // second operand
		y.begin(),            // destination (overwrites y)
		saxpy_functor(a));
}


/* Abort with a diagnostic if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/* Abort with a diagnostic if a cuBLAS call did not succeed. */
static void checkCublas(cublasStatus_t status, const char *what) {
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf(stderr, "cuBLAS error in %s (status %d)\n", what, (int)status);
		exit(EXIT_FAILURE);
	}
}

/*
 * Times y = 3 * x + y over N floats with three implementations (the gSaxpy
 * kernel, cublasSaxpy, and thrust::transform via saxpy()) and prints the
 * elapsed time of each in milliseconds, measured with CUDA events.
 *
 * Returns 0 on success; terminates with a non-zero status and a message on
 * stderr if any allocation, CUDA runtime call, or cuBLAS call fails.
 */
int main() {
	const size_t n = N;
	const size_t bytes = n * sizeof(float);
	const float alpha = 3.0f;
	float elapsedTime = 0.0f;

	// One pair of events is shared by all three measurements.
	cudaEvent_t start, stop;
	checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
	checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

	/* Cuda Kernel */
	float *x_d = NULL, *y_d = NULL;
	float *x_h = (float*)calloc(n, sizeof(float));
	float *y_h = (float*)calloc(n, sizeof(float));
	if (x_h == NULL || y_h == NULL) {
		fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
		free(x_h);
		free(y_h);
		return EXIT_FAILURE;
	}
	checkCuda(cudaMalloc((void**)&x_d, bytes), "cudaMalloc(x_d)");
	checkCuda(cudaMalloc((void**)&y_d, bytes), "cudaMalloc(y_d)");

	for (size_t i = 0; i < n; i++) {
		x_h[i] = (float)i;
		y_h[i] = 0.87f;
	}

	checkCuda(cudaMemcpy(x_d, x_h, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(x_d)");
	checkCuda(cudaMemcpy(y_d, y_h, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(y_d)");

	// gSaxpy has no bounds check, so the launch must cover exactly n
	// elements: full 256-thread blocks first, then one partial block for
	// whatever remains when n is not a multiple of the block size.
	const unsigned int threadsPerBlock = 256;
	const size_t fullBlocks = n / threadsPerBlock;
	const unsigned int tailThreads = (unsigned int)(n % threadsPerBlock);
	const size_t covered = fullBlocks * threadsPerBlock;

	checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
	if (fullBlocks > 0) {
		gSaxpy <<< (unsigned int)fullBlocks, threadsPerBlock >>> (alpha, x_d, y_d);
	}
	if (tailThreads > 0) {
		gSaxpy <<< 1, tailThreads >>> (alpha, x_d + covered, y_d + covered);
	}
	checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
	// Launch-configuration errors are only visible through cudaGetLastError.
	checkCuda(cudaGetLastError(), "gSaxpy launch");

	// Waiting on the stop event is sufficient; it also surfaces any fault
	// raised while the kernel was executing.
	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
	checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
	checkCuda(cudaMemcpy(y_h, y_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y_h)");
	printf("_______________________________\n");
	printf("CUDA Kernel Time:\n \t %f ms\n", elapsedTime);
	free(x_h);
	free(y_h);
	checkCuda(cudaFree(y_d), "cudaFree(y_d)");
	checkCuda(cudaFree(x_d), "cudaFree(x_d)");
	/* Cuda Kernel */


	/* Cuda Cublas */
	float *cx_d = NULL, *cx_h = NULL, *cy_h = NULL, *cy_d = NULL;

	checkCuda(cudaMallocHost((void**)&cx_h, bytes), "cudaMallocHost(cx_h)");
	checkCuda(cudaMallocHost((void**)&cy_h, bytes), "cudaMallocHost(cy_h)");
	checkCuda(cudaMalloc((void**)&cx_d, bytes), "cudaMalloc(cx_d)");
	checkCuda(cudaMalloc((void**)&cy_d, bytes), "cudaMalloc(cy_d)");

	for (size_t i = 0; i < n; i++) {
		cx_h[i] = (float)i;
		cy_h[i] = 0.87f;
	}

	cublasHandle_t cublas_handle;
	checkCublas(cublasCreate(&cublas_handle), "cublasCreate");

	// Each vector is transferred as an n x 1 matrix.
	const int num_rows = (int)n;
	const int num_cols = 1;
	const size_t elem_size = sizeof(float);

	checkCublas(cublasSetMatrix(num_rows, num_cols, elem_size, cx_h, num_rows, cx_d, num_rows),
	            "cublasSetMatrix(cx)");
	checkCublas(cublasSetMatrix(num_rows, num_cols, elem_size, cy_h, num_rows, cy_d, num_rows),
	            "cublasSetMatrix(cy)");

	const int stride = 1;

	checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
	checkCublas(cublasSaxpy(cublas_handle, num_rows, &alpha, cx_d, stride, cy_d, stride),
	            "cublasSaxpy");
	checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");

	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
	checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");

	checkCublas(cublasGetMatrix(num_rows, num_cols, elem_size, cx_d, num_rows, cx_h, num_rows),
	            "cublasGetMatrix(cx)");
	checkCublas(cublasGetMatrix(num_rows, num_cols, elem_size, cy_d, num_rows, cy_h, num_rows),
	            "cublasGetMatrix(cy)");
	printf("\n_______________________________\n");
	printf("cuBLAS Time:\n \t %f ms\n", elapsedTime);
	checkCublas(cublasDestroy(cublas_handle), "cublasDestroy");
	checkCuda(cudaFreeHost(cx_h), "cudaFreeHost(cx_h)");
	checkCuda(cudaFreeHost(cy_h), "cudaFreeHost(cy_h)");
	checkCuda(cudaFree(cx_d), "cudaFree(cx_d)");
	checkCuda(cudaFree(cy_d), "cudaFree(cy_d)");
	/* Cuda Cublas */

	/* Cuda Thrust */
	thrust::host_vector<float> h1(n);
	thrust::host_vector<float> h2(n);
	thrust::sequence(h1.begin(), h1.end());
	thrust::fill(h2.begin(), h2.end(), 0.87f);

	thrust::device_vector<float> d1 = h1;
	thrust::device_vector<float> d2 = h2;

	checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
	saxpy(alpha, d1, d2);
	checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");

	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
	checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");

	h2 = d2;
	h1 = d1;
	printf("\n_______________________________\n");
	printf("THRUST Time:\n \t %f ms\n\n", elapsedTime);
	checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
	checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

	/* Cuda Thrust */


	return 0;
}