// Untitled

#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>

#include <stdio.h>
#include <time.h>
#include <cstdlib>
#include <ctime>
#include <iostream>

using namespace std;

// Binary functor computing a * x + y for a fixed scalar `a`.
// Usable from both host and device code; passed to thrust::transform below.
struct SaxpyFunctor {
	// Scale factor applied to the first operand.
	const float a;

	// Stores the scale factor; callable on host and device.
	__host__ __device__ SaxpyFunctor(const float a) : a(a) {}

	// Returns a * x + y. Declared const because the call does not modify the
	// functor, which lets it be invoked through const objects and references.
	__host__ __device__ float operator ()(float x, float y) const {
		return a * x + y;
	}
};

// SAXPY kernel: each thread updates a single element, y[k] = a * x[k] + y[k].
//
// The element index is derived from a 1D grid of 1D blocks. There is no
// bounds check, so the launch must supply exactly one thread per element to
// process (gridDim.x * blockDim.x threads in total) and both arrays must hold
// at least that many floats.
__global__ void saxpyKernel(float a, float *x, float *y) {
	const int k = blockIdx.x * blockDim.x + threadIdx.x;
	const float xk = x[k];
	const float yk = y[k];
	y[k] = a * xk + yk;
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		std::exit(EXIT_FAILURE);
	}
}

// Terminates the program with a diagnostic if a cuBLAS call failed.
static void checkCublas(cublasStatus_t status, const char *what) {
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf(stderr, "cuBLAS error during %s: status %d\n", what, static_cast<int>(status));
		std::exit(EXIT_FAILURE);
	}
}

// Records `stop`, waits for it to complete and returns the milliseconds
// elapsed since `start` was recorded.
static float stopTimer(cudaEvent_t start, cudaEvent_t stop) {
	float ms = 0.0f;
	checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
	// Waiting on the event also surfaces errors raised by the timed work.
	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
	checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");
	return ms;
}

// Benchmarks SAXPY (y = a * x + y) on n = 2^t floats with three
// implementations: the hand-written kernel, thrust::transform and cuBLAS.
//
// Reads the exponent t from standard input and prints one timing line (in
// milliseconds, measured with CUDA events) per implementation.
// Returns 0 on success; exits with a non-zero status on invalid input or on
// any CUDA / cuBLAS failure.
int main() {
	const float a = 100.0f;
	const int maxThreadsPerBlock = 1 << 10;

	// 1 << t must fit in a positive int, so only exponents 0..30 are valid.
	int t;
	if (!(cin >> t) || t < 0 || t > 30) {
		fprintf(stderr, "Expected an integer exponent t with 0 <= t <= 30\n");
		return 1;
	}
	const int n = 1 << t;
	const size_t bytes = static_cast<size_t>(n) * sizeof(float);

	// n is a power of two, so using min(n, 1024) threads per block makes
	// blocks * threadsPerBlock == n exactly. The kernel has no bounds check
	// and relies on this; it also avoids a zero-block launch when n < 1024.
	const int threadsPerBlock = n < maxThreadsPerBlock ? n : maxThreadsPerBlock;
	const int blocks = n / threadsPerBlock;

	cudaEvent_t start, stop;
	checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
	checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

	float ms;
	float *array = new float[n];
	for(int i = 0; i < n; i++)
		array[i] = static_cast<float>(i);

	// PURE CUDA
	float *deviceX, *deviceY;
	checkCuda(cudaMalloc((void**)&deviceX, bytes), "cudaMalloc(deviceX)");
	checkCuda(cudaMalloc((void**)&deviceY, bytes), "cudaMalloc(deviceY)");

	checkCuda(cudaMemcpy(deviceX, array, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(deviceX)");
	checkCuda(cudaMemcpy(deviceY, array, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(deviceY)");

	checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
	saxpyKernel<<<blocks, threadsPerBlock>>>(a, deviceX, deviceY);
	// Launch-configuration errors are only reported through cudaGetLastError.
	checkCuda(cudaGetLastError(), "saxpyKernel launch");
	ms = stopTimer(start, stop);
	printf("Pure cuda: %f\n", ms);

	checkCuda(cudaMemcpy(array, deviceY, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(array)");

	// THRUST
	thrust::device_vector<float> thrustX(n);
	thrust::device_vector<float> thrustY(n);

	thrust::sequence(thrustX.begin(), thrustX.end());
	thrust::sequence(thrustY.begin(), thrustY.end());

	checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
	thrust::transform(thrustX.begin(), thrustX.end(), thrustY.begin(), thrustY.begin(), SaxpyFunctor(a));
	ms = stopTimer(start, stop);
	printf("Thrust: %f\n", ms);

	//thrust::copy(thrustY.begin(), thrustY.begin() + 10, std::ostream_iterator<float>(std::cout, "\n"));

	// CUBLAS
	cublasHandle_t cublasHandle;
	checkCublas(cublasCreate(&cublasHandle), "cublasCreate");

	// Reset the host buffer: it currently holds the result of the first run.
	for(int i = 0; i < n; i++)
		array[i] = static_cast<float>(i);

	const int stride = 1;
	checkCublas(cublasSetVector(n, sizeof(float), array, stride, deviceX, stride), "cublasSetVector(deviceX)");
	checkCublas(cublasSetVector(n, sizeof(float), array, stride, deviceY, stride), "cublasSetVector(deviceY)");

	checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
	checkCublas(cublasSaxpy(cublasHandle, n, &a, deviceX, stride, deviceY, stride), "cublasSaxpy");
	ms = stopTimer(start, stop);
	printf("cuBLAS: %f\n", ms);

	checkCublas(cublasGetVector(n, sizeof(float), deviceY, stride, array, stride), "cublasGetVector");
	checkCublas(cublasDestroy(cublasHandle), "cublasDestroy");

	/*for(int i = 0; i < 10; i++) {
		printf("%f\n", array[i]);
	}*/

	checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
	checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
	checkCuda(cudaFree(deviceX), "cudaFree(deviceX)");
	checkCuda(cudaFree(deviceY), "cudaFree(deviceY)");
	delete[] array;

	return 0;
}