Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <stdio.h>
#include <stdlib.h>

#include <cublas_v2.h>

#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
/*
 * Number of elements in each SAXPY vector.
 *
 * The shift count must stay below the width of int: a count such as 50 is
 * undefined behaviour for a 32-bit int and yields a meaningless size.
 * 10 << 20 is 10,485,760 floats (40 MiB per vector); it is a multiple of 256
 * and small enough that every index is exactly representable as a float.
 * NOTE(review): the originally intended size is not known; adjust as needed.
 */
#define N (10 << 20)
/*
 * SAXPY kernel: y[i] = alpha * x[i] + y[i], one element per thread.
 *
 * Launch layout: 1-D grid of 1-D blocks; the element index is
 * threadIdx.x + blockIdx.x * blockDim.x. No shared memory is used.
 *
 * alpha  scalar multiplier applied to x
 * x      input vector, read on the device; must hold at least N floats
 * y      input/output vector, updated in place; must hold at least N floats
 *
 * Threads whose index is N or greater do nothing, so the grid may be rounded
 * up to a whole number of blocks without reading or writing past the end.
 */
__global__ void gSaxpy(float alpha, float *x, float *y) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N) {
        y[i] = alpha * x[i] + y[i];
    }
}
/*
 * Binary functor computing a * x + y for a fixed scalar a.
 *
 * The call operator is const-qualified so the functor can be invoked through
 * a const object or const reference; it never modifies its state.
 */
struct saxpy_functor {
    const float a;  // scalar multiplier captured at construction

    saxpy_functor(float _a) : a(_a) {}

    // Returns a * x + y; callable from both host and device code.
    __host__ __device__ float operator()(float x, float y) const {
        return a * x + y;
    }
};
/*
 * In-place SAXPY over two thrust::device_vector operands: y <- a * x + y.
 *
 * a  scalar multiplier
 * x  first operand; its range [begin, end) sets how many elements are processed
 * y  second operand and destination; x.size() elements are read from and
 *    written back to it starting at y.begin()
 */
void saxpy(float a, thrust::device_vector<float>& x, thrust::device_vector<float>& y) {
    thrust::transform(x.begin(), x.end(),   // first input range
                      y.begin(),            // second input
                      y.begin(),            // output overwrites y
                      saxpy_functor(a));
}
/* Aborts with file, line and the CUDA error string when a runtime call fails. */
#define CHECK_CUDA(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Aborts with file, line and the numeric status when a cuBLAS call fails. */
#define CHECK_CUBLAS(call)                                                 \
    do {                                                                   \
        cublasStatus_t status_ = (call);                                   \
        if (status_ != CUBLAS_STATUS_SUCCESS) {                            \
            fprintf(stderr, "cuBLAS error at %s:%d: status %d\n",          \
                    __FILE__, __LINE__, (int)status_);                     \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Initial value of every element of y in all three benchmarks. */
static const float kInitialY = 0.87f;

/* Waits for `stop` to complete and returns the start->stop time in ms. */
static float elapsedMs(cudaEvent_t start, cudaEvent_t stop) {
    float ms = 0.0f;
    /* Errors raised while the timed work executed are reported here. */
    CHECK_CUDA(cudaEventSynchronize(stop));
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));
    return ms;
}

/* Times one launch of the hand-written gSaxpy kernel over n elements. */
static float timeKernelSaxpy(int n, float alpha, cudaEvent_t start, cudaEvent_t stop) {
    const int threads = 256;
    /* Ceil-div written so it cannot overflow for n close to INT_MAX. */
    const int blocks = n / threads + (n % threads != 0 ? 1 : 0);
    const size_t bytes = (size_t)n * sizeof(float);
    /* Device buffers are rounded up to a whole number of blocks so that
       every launched thread indexes valid memory even when n is not a
       multiple of the block size. */
    const size_t paddedBytes = (size_t)blocks * threads * sizeof(float);

    float *x_h = (float*)calloc((size_t)n, sizeof(float));
    float *y_h = (float*)calloc((size_t)n, sizeof(float));
    if (x_h == NULL || y_h == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < n; i++) {
        x_h[i] = (float)i;
        y_h[i] = kInitialY;
    }

    float *x_d = NULL;
    float *y_d = NULL;
    CHECK_CUDA(cudaMalloc((void**)&x_d, paddedBytes));
    CHECK_CUDA(cudaMalloc((void**)&y_d, paddedBytes));
    /* Zero the padding so no thread reads uninitialised device memory. */
    CHECK_CUDA(cudaMemset(x_d, 0, paddedBytes));
    CHECK_CUDA(cudaMemset(y_d, 0, paddedBytes));
    CHECK_CUDA(cudaMemcpy(x_d, x_h, bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(y_d, y_h, bytes, cudaMemcpyHostToDevice));

    CHECK_CUDA(cudaEventRecord(start, 0));
    gSaxpy<<<blocks, threads>>>(alpha, x_d, y_d);
    /* A launch reports bad configurations only through cudaGetLastError. */
    CHECK_CUDA(cudaGetLastError());
    /* Record stop right after the launch; waiting on the event (instead of
       a device-wide sync before recording) keeps host-side waiting out of
       the measured interval. */
    CHECK_CUDA(cudaEventRecord(stop, 0));
    const float ms = elapsedMs(start, stop);

    /* Only y is modified by SAXPY, so only y is copied back. */
    CHECK_CUDA(cudaMemcpy(y_h, y_d, bytes, cudaMemcpyDeviceToHost));

    free(x_h);
    free(y_h);
    CHECK_CUDA(cudaFree(y_d));
    CHECK_CUDA(cudaFree(x_d));
    return ms;
}

/* Times one cublasSaxpy call over n elements using pinned host buffers. */
static float timeCublasSaxpy(int n, float alpha, cudaEvent_t start, cudaEvent_t stop) {
    const size_t bytes = (size_t)n * sizeof(float);
    const int elemSize = (int)sizeof(float);
    const int stride = 1;

    float *cx_h = NULL;
    float *cy_h = NULL;
    float *cx_d = NULL;
    float *cy_d = NULL;
    CHECK_CUDA(cudaMallocHost((void**)&cx_h, bytes));
    CHECK_CUDA(cudaMallocHost((void**)&cy_h, bytes));
    CHECK_CUDA(cudaMalloc((void**)&cx_d, bytes));
    CHECK_CUDA(cudaMalloc((void**)&cy_d, bytes));
    for (int i = 0; i < n; i++) {
        cx_h[i] = (float)i;
        cy_h[i] = kInitialY;
    }

    cublasHandle_t cublas_handle;
    CHECK_CUBLAS(cublasCreate(&cublas_handle));
    /* The operands are plain vectors, so use the vector transfer helpers. */
    CHECK_CUBLAS(cublasSetVector(n, elemSize, cx_h, stride, cx_d, stride));
    CHECK_CUBLAS(cublasSetVector(n, elemSize, cy_h, stride, cy_d, stride));

    CHECK_CUDA(cudaEventRecord(start, 0));
    CHECK_CUBLAS(cublasSaxpy(cublas_handle, n, &alpha, cx_d, stride, cy_d, stride));
    CHECK_CUDA(cudaEventRecord(stop, 0));
    const float ms = elapsedMs(start, stop);

    /* Only y is modified by SAXPY, so only y is copied back. */
    CHECK_CUBLAS(cublasGetVector(n, elemSize, cy_d, stride, cy_h, stride));

    CHECK_CUBLAS(cublasDestroy(cublas_handle));
    CHECK_CUDA(cudaFreeHost(cx_h));
    CHECK_CUDA(cudaFreeHost(cy_h));
    CHECK_CUDA(cudaFree(cx_d));
    CHECK_CUDA(cudaFree(cy_d));
    return ms;
}

/* Times the Thrust-based saxpy() over n elements. */
static float timeThrustSaxpy(int n, float alpha, cudaEvent_t start, cudaEvent_t stop) {
    thrust::host_vector<float> h1(n);
    thrust::host_vector<float> h2(n);
    thrust::sequence(h1.begin(), h1.end());      /* 0, 1, 2, ... */
    thrust::fill(h2.begin(), h2.end(), kInitialY);
    thrust::device_vector<float> d1 = h1;
    thrust::device_vector<float> d2 = h2;

    CHECK_CUDA(cudaEventRecord(start, 0));
    saxpy(alpha, d1, d2);
    CHECK_CUDA(cudaEventRecord(stop, 0));
    const float ms = elapsedMs(start, stop);

    /* Only d2 is modified by saxpy(), so only it is copied back. */
    h2 = d2;
    return ms;
}

/*
 * Benchmarks SAXPY (y = 3 * x + y over N floats, x[i] = i, y[i] = 0.87)
 * three ways -- a hand-written kernel, cuBLAS, and Thrust -- and prints the
 * time of each in milliseconds, measured with CUDA events.
 *
 * Returns 0 on success, 1 when N is not positive. Any failing CUDA or cuBLAS
 * call prints a diagnostic to stderr and terminates the process.
 */
int main() {
    const int n = N;
    const float alpha = 3.0f;
    float elapsedTime;

    if (n <= 0) {
        fprintf(stderr, "N must be positive (got %d)\n", n);
        return 1;
    }

    /* One event pair is shared by all three measurements. */
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    /* Cuda Kernel */
    elapsedTime = timeKernelSaxpy(n, alpha, start, stop);
    printf("_______________________________\n");
    printf("CUDA Kernel Time:\n \t %f ms\n", elapsedTime);

    /* Cuda Cublas */
    elapsedTime = timeCublasSaxpy(n, alpha, start, stop);
    printf("\n_______________________________\n");
    printf("cuBLAS Time:\n \t %f ms\n", elapsedTime);

    /* Cuda Thrust */
    elapsedTime = timeThrustSaxpy(n, alpha, start, stop);
    printf("\n_______________________________\n");
    printf("THRUST Time:\n \t %f ms\n\n", elapsedTime);

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement