Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <cuda.h>
- #include <stdio.h>
- #include <thrust/host_vector.h>
- #include <thrust/device_vector.h>
- #include <thrust/transform.h>
- #include <thrust/fill.h>
- #include <thrust/sequence.h>
- #include <cublas_v2.h>
- #include <iostream>
- #include <ctime>
- #include <time.h>
- using namespace std;
/**
 * Binary functor computing a * x + y for a fixed scalar `a`.
 *
 * Callable from both host and device code, so it can be handed to
 * thrust::transform as the element-wise operation.
 */
struct SaxpyFunctor {
    const float a;  // scalar multiplier captured at construction

    SaxpyFunctor(const float a) : a(a) {}

    // const-qualified: the call only reads `a`, so it must also be usable on
    // const functor objects and through const references.
    __host__ __device__ float operator ()(float x, float y) const {
        return a * x + y;
    }
};
/**
 * SAXPY kernel: y[i] = a * x[i] + y[i], one element per thread.
 *
 * Launch layout: 1-D grid of 1-D blocks. The thread with global index
 * i = blockIdx.x * blockDim.x + threadIdx.x processes element i.
 * No shared memory is used.
 *
 * @param a  scalar multiplier
 * @param x  input vector, read by device code
 * @param y  input/output vector, updated in place by device code
 * @param n  optional element count. When n >= 0, threads whose index is
 *           >= n do nothing, so the grid may be rounded up past the data.
 *           The default (-1) disables the check and keeps the previous
 *           behaviour: the launch must then cover exactly the valid
 *           elements, otherwise the kernel accesses memory out of bounds.
 */
__global__ void saxpyKernel(float a, float *x, float *y, int n = -1) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Tail guard: skip threads that fall beyond the end of the vectors.
    if (n >= 0 && i >= n)
        return;
    y[i] = a * x[i] + y[i];
}
// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Prints a diagnostic on stderr when a cuBLAS call failed.
// Returns true when `status` is CUBLAS_STATUS_SUCCESS.
static bool cublasOk(cublasStatus_t status, const char *what) {
    if (status == CUBLAS_STATUS_SUCCESS)
        return true;
    fprintf(stderr, "cuBLAS error in %s (status %d)\n", what, (int)status);
    return false;
}

// Abort main() with exit code 1 as soon as a checked call fails. Resources
// still held at that point are reclaimed by process teardown.
#define SAXPY_TRY_CUDA(call) \
    do { if (!cudaOk((call), #call)) return 1; } while (0)
#define SAXPY_TRY_CUBLAS(call) \
    do { if (!cublasOk((call), #call)) return 1; } while (0)

/**
 * Benchmarks y = a * x + y on n = 2^t floats with three implementations
 * (hand-written kernel, Thrust, cuBLAS) and prints the elapsed time of each,
 * in milliseconds as measured by CUDA events.
 *
 * Reads the exponent t from standard input. Returns 0 on success and 1 on
 * invalid input or on any failed CUDA / cuBLAS call.
 */
int main() {
    const float a = 100;

    // n = 1 << t must fit in a positive int, hence the 0..30 range.
    int t;
    if (!(std::cin >> t) || t < 0 || t > 30) {
        fprintf(stderr, "Expected an integer exponent t with 0 <= t <= 30\n");
        return 1;
    }
    const int n = 1 << t;
    const size_t bytes = static_cast<size_t>(n) * sizeof(float);

    // n is a power of two. Clamping the block size to n keeps the grid
    // non-empty for n < 1024 and makes blocks * threads == n exactly, so the
    // kernel never touches memory past the end of the buffers.
    const int threadsPerBlock = 1 << 10;
    const int threads = n < threadsPerBlock ? n : threadsPerBlock;
    const int blocks = n / threads;

    cudaEvent_t start, stop;
    SAXPY_TRY_CUDA(cudaEventCreate(&start));
    SAXPY_TRY_CUDA(cudaEventCreate(&stop));
    float ms = 0.0f;

    // Host staging buffer holding 0, 1, 2, ..., n-1.
    float *array = new float[n];
    for (int i = 0; i < n; i++)
        array[i] = i;

    // PURE CUDA
    float *deviceX = NULL, *deviceY = NULL;
    SAXPY_TRY_CUDA(cudaMalloc((void**)&deviceX, bytes));
    SAXPY_TRY_CUDA(cudaMalloc((void**)&deviceY, bytes));
    SAXPY_TRY_CUDA(cudaMemcpy(deviceX, array, bytes, cudaMemcpyHostToDevice));
    SAXPY_TRY_CUDA(cudaMemcpy(deviceY, array, bytes, cudaMemcpyHostToDevice));
    SAXPY_TRY_CUDA(cudaEventRecord(start));
    saxpyKernel<<<blocks, threads>>>(a, deviceX, deviceY);
    // Launch-configuration errors are only visible through cudaGetLastError().
    SAXPY_TRY_CUDA(cudaGetLastError());
    SAXPY_TRY_CUDA(cudaEventRecord(stop));
    // Errors raised while the kernel runs surface at this synchronisation.
    SAXPY_TRY_CUDA(cudaEventSynchronize(stop));
    SAXPY_TRY_CUDA(cudaEventElapsedTime(&ms, start, stop));
    printf("Pure cuda: %f\n", ms);
    SAXPY_TRY_CUDA(cudaMemcpy(array, deviceY, bytes, cudaMemcpyDeviceToHost));

    // THRUST
    thrust::device_vector<float> thrustX(n);
    thrust::device_vector<float> thrustY(n);
    thrust::sequence(thrustX.begin(), thrustX.end());
    thrust::sequence(thrustY.begin(), thrustY.end());
    SAXPY_TRY_CUDA(cudaEventRecord(start));
    thrust::transform(thrustX.begin(), thrustX.end(), thrustY.begin(), thrustY.begin(), SaxpyFunctor(a));
    SAXPY_TRY_CUDA(cudaEventRecord(stop));
    SAXPY_TRY_CUDA(cudaEventSynchronize(stop));
    SAXPY_TRY_CUDA(cudaEventElapsedTime(&ms, start, stop));
    printf("Thrust: %f\n", ms);
    //thrust::copy(thrustY.begin(), thrustY.begin() + 10, std::ostream_iterator<float>(std::cout, "\n"));

    // CUBLAS
    cublasHandle_t cublasHandle;
    SAXPY_TRY_CUBLAS(cublasCreate(&cublasHandle));
    // The staging buffer now holds the kernel's result; restore 0..n-1.
    for (int i = 0; i < n; i++)
        array[i] = i;
    const int stride = 1;
    SAXPY_TRY_CUBLAS(cublasSetVector(n, sizeof(float), array, stride, deviceX, stride));
    SAXPY_TRY_CUBLAS(cublasSetVector(n, sizeof(float), array, stride, deviceY, stride));
    SAXPY_TRY_CUDA(cudaEventRecord(start));
    SAXPY_TRY_CUBLAS(cublasSaxpy(cublasHandle, n, &a, deviceX, stride, deviceY, stride));
    SAXPY_TRY_CUDA(cudaEventRecord(stop));
    SAXPY_TRY_CUDA(cudaEventSynchronize(stop));
    SAXPY_TRY_CUDA(cudaEventElapsedTime(&ms, start, stop));
    printf("cuBLAS: %f\n", ms);
    SAXPY_TRY_CUBLAS(cublasGetVector(n, sizeof(float), deviceY, stride, array, stride));
    SAXPY_TRY_CUBLAS(cublasDestroy(cublasHandle));
    /*for(int i = 0; i < 10; i++) {
        printf("%f\n", array[i]);
    }*/

    SAXPY_TRY_CUDA(cudaEventDestroy(start));
    SAXPY_TRY_CUDA(cudaEventDestroy(stop));
    SAXPY_TRY_CUDA(cudaFree(deviceX));
    SAXPY_TRY_CUDA(cudaFree(deviceY));
    delete[] array;
    return 0;
}

#undef SAXPY_TRY_CUDA
#undef SAXPY_TRY_CUBLAS
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement